/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* An implementation of COO SpMV using prefix scan to implement a
* reduce-value-by-row strategy
******************************************************************************/
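//
// Overview: each nonzero A(row, col) contributes A(row, col) * x(col) to the
// output row sum y(row). Tiles of COO nonzeros are scanned with a
// reduce-value-by-row operator whose running sum restarts whenever the row-id
// changes; accumulated totals are scattered at row-head positions. When more
// than one thread block is launched, each block also records its first and
// last partial products so that a second, single-block kernel
// (CooFinalizeKernel) can "fix up" rows that straddle block boundaries.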
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <iterator>
#include <vector>
#include <string>
#include <algorithm>
#include <stdio.h>
#include <cub/cub.cuh>
#include "coo_graph.cuh"
#include "../test/test_util.h"
using namespace cub;
using namespace std;
/******************************************************************************
* Globals, constants, and typedefs
******************************************************************************/
typedef int VertexId; // signed 32-bit ints as vertex ids
typedef double Value; // double-precision floating point values
bool g_verbose = false;
int g_timing_iterations = 1;
CachingDeviceAllocator g_allocator;
/******************************************************************************
* Texture referencing
******************************************************************************/
/**
* Templated texture reference type for multiplicand vector
*/
template <typename Value>
struct TexVector
{
// Texture type to actually use (e.g., because CUDA doesn't load doubles as texture items)
typedef typename If<(Equals<Value, double>::VALUE), uint2, Value>::Type CastType;
// Texture reference type
typedef texture<CastType, cudaTextureType1D, cudaReadModeElementType> TexRef;
static TexRef ref;
/**
* Bind textures
*/
static void BindTexture(void *d_in, int elements)
{
cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc<CastType>();
if (d_in)
{
size_t offset;
size_t bytes = sizeof(CastType) * elements;
CubDebugExit(cudaBindTexture(&offset, ref, d_in, tex_desc, bytes));
}
}
/**
* Unbind textures
*/
static void UnbindTexture()
{
CubDebugExit(cudaUnbindTexture(ref));
}
/**
* Load
*/
static __device__ __forceinline__ Value Load(int offset)
{
Value output;
reinterpret_cast<typename TexVector<Value>::CastType &>(output) = tex1Dfetch(TexVector<Value>::ref, offset);
return output;
}
};
// Texture reference definitions
template <typename Value>
typename TexVector<Value>::TexRef TexVector<Value>::ref = 0;
/******************************************************************************
* Utility types
******************************************************************************/
/**
* A partial dot-product sum paired with a corresponding row-id
*/
template <typename VertexId, typename Value>
struct PartialProduct
{
VertexId row; /// Row-id
Value partial; /// PartialProduct sum
};
/**
* A partial dot-product sum paired with a corresponding row-id (specialized for double-int pairings)
*/
template <>
struct PartialProduct<int, double>
{
long long row; /// Row-id
double partial; /// PartialProduct sum
};
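// (The row-id is widened to 64 bits here, presumably so the struct forms a
// naturally-aligned 16-byte pair when the value type is a double.)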
/**
* Reduce-value-by-row scan operator
*/
struct ReduceByKeyOp
{
template <typename PartialProduct>
__device__ __forceinline__ PartialProduct operator()(
const PartialProduct &first,
const PartialProduct &second)
{
PartialProduct retval;
retval.partial = (second.row != first.row) ?
second.partial :
first.partial + second.partial;
retval.row = second.row;
return retval;
}
};
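// Illustrative example: combining the (row, partial) sequence
// (0,a) (0,b) (1,c) (1,d) left-to-right with this operator yields
// (0,a+b), then (1,c), then (1,c+d): sums accumulate within a row and
// restart whenever the row-id changes.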
/**
* Stateful block-wide prefix operator for BlockScan
*/
template <typename PartialProduct>
struct BlockPrefixCallbackOp
{
// Running block-wide prefix
PartialProduct running_prefix;
/**
* Returns the block-wide running_prefix in thread-0
*/
__device__ __forceinline__ PartialProduct operator()(
const PartialProduct &block_aggregate) ///< The aggregate sum of the BlockScan inputs
{
ReduceByKeyOp scan_op;
PartialProduct retval = running_prefix;
running_prefix = scan_op(running_prefix, block_aggregate);
return retval;
}
};
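// E.g., if the running prefix is (row 0, sum 5) and a tile's block-wide
// aggregate is (row 0, sum 3), the callback returns (0, 5) to seed that
// tile's exclusive scan and advances the running prefix to (0, 8).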
/**
* Operator for detecting discontinuities in a list of row identifiers.
*/
struct NewRowOp
{
/// Returns true if row_b is the start of a new row
template <typename VertexId>
__device__ __forceinline__ bool operator()(
const VertexId& row_a,
const VertexId& row_b)
{
return (row_a != row_b);
}
};
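// E.g., rows (0, 0, 1, 1, 3) produce head flags (?, 0, 1, 0, 1), where the
// first flag is decided by comparing against the tile-predecessor row-id
// passed to BlockDiscontinuity::FlagHeads.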
/******************************************************************************
* Persistent thread block types
******************************************************************************/
/**
* SpMV thread block abstraction for processing a contiguous segment of
* sparse COO tiles.
*/
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
typename VertexId,
typename Value>
struct PersistentBlockSpmv
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// Constants
enum
{
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
};
// Head flag type
typedef int HeadFlag;
// Partial dot product type
typedef PartialProduct<VertexId, Value> PartialProduct;
// Parameterized BlockScan type for reduce-value-by-row scan
typedef BlockScan<PartialProduct, BLOCK_THREADS, BLOCK_SCAN_RAKING_MEMOIZE> BlockScan;
// Parameterized BlockExchange type for transposing rows from a warp-striped to a blocked arrangement
typedef BlockExchange<VertexId, BLOCK_THREADS, ITEMS_PER_THREAD, true> BlockExchangeRows;
// Parameterized BlockExchange type for transposing values from a warp-striped to a blocked arrangement
typedef BlockExchange<Value, BLOCK_THREADS, ITEMS_PER_THREAD, true> BlockExchangeValues;
// Parameterized BlockDiscontinuity type for setting head-flags for each new row segment
typedef BlockDiscontinuity<HeadFlag, BLOCK_THREADS> BlockDiscontinuity;
// Shared memory type for this thread block
struct TempStorage
{
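// Note: the exchange storage may alias the scan/discontinuity storage;
// the __syncthreads() barriers in ProcessTile separate their live ranges.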
union
{
typename BlockExchangeRows::TempStorage exchange_rows; // Smem needed for BlockExchangeRows
typename BlockExchangeValues::TempStorage exchange_values; // Smem needed for BlockExchangeValues
struct
{
typename BlockScan::TempStorage scan; // Smem needed for BlockScan
typename BlockDiscontinuity::TempStorage discontinuity; // Smem needed for BlockDiscontinuity
};
};
VertexId first_block_row; ///< The first row-ID seen by this thread block
VertexId last_block_row; ///< The last row-ID seen by this thread block
Value first_product; ///< The first dot-product written by this thread block
};
//---------------------------------------------------------------------
// Thread fields
//---------------------------------------------------------------------
TempStorage &temp_storage;
BlockPrefixCallbackOp<PartialProduct> prefix_op;
VertexId *d_rows;
VertexId *d_columns;
Value *d_values;
Value *d_vector;
Value *d_result;
PartialProduct *d_block_partials;
int block_offset;
int block_end;
//---------------------------------------------------------------------
// Operations
//---------------------------------------------------------------------
/**
* Constructor
*/
__device__ __forceinline__
PersistentBlockSpmv(
TempStorage &temp_storage,
VertexId *d_rows,
VertexId *d_columns,
Value *d_values,
Value *d_vector,
Value *d_result,
PartialProduct *d_block_partials,
int block_offset,
int block_end)
:
temp_storage(temp_storage),
d_rows(d_rows),
d_columns(d_columns),
d_values(d_values),
d_vector(d_vector),
d_result(d_result),
d_block_partials(d_block_partials),
block_offset(block_offset),
block_end(block_end)
{
// Initialize scalar shared memory values
if (threadIdx.x == 0)
{
VertexId first_block_row = d_rows[block_offset];
VertexId last_block_row = d_rows[block_end - 1];
temp_storage.first_block_row = first_block_row;
temp_storage.last_block_row = last_block_row;
temp_storage.first_product = Value(0);
// Initialize prefix_op to identity
prefix_op.running_prefix.row = first_block_row;
prefix_op.running_prefix.partial = Value(0);
}
__syncthreads();
}
/**
* Processes a COO input tile of edges, outputting dot products for each row
*/
template <bool FULL_TILE>
__device__ __forceinline__ void ProcessTile(
int block_offset,
int guarded_items = 0)
{
VertexId columns[ITEMS_PER_THREAD];
VertexId rows[ITEMS_PER_THREAD];
Value values[ITEMS_PER_THREAD];
PartialProduct partial_sums[ITEMS_PER_THREAD];
HeadFlag head_flags[ITEMS_PER_THREAD];
// Load a thread block-striped tile of A (sparse row-ids, column-ids, and values)
if (FULL_TILE)
{
// Unguarded loads
LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_columns + block_offset, columns);
LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_values + block_offset, values);
LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_rows + block_offset, rows);
}
else
{
// Partial tile (e.g., the last tile of input): extend the row-id of the
// block's last vertex to out-of-bounds items, but make them zero-valued
LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_columns + block_offset, columns, guarded_items, VertexId(0));
LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_values + block_offset, values, guarded_items, Value(0));
LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_rows + block_offset, rows, guarded_items, temp_storage.last_block_row);
}
// Load the referenced values from x and compute the dot-product partial sums
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
#if CUB_PTX_ARCH >= 350
values[ITEM] *= ThreadLoad<LOAD_LDG>(d_vector + columns[ITEM]);
#else
values[ITEM] *= TexVector<Value>::Load(columns[ITEM]);
#endif
}
// Transpose from warp-striped to blocked arrangement
BlockExchangeValues(temp_storage.exchange_values).WarpStripedToBlocked(values);
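// Barrier for smem reuse and coherence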
__syncthreads();
// Transpose from warp-striped to blocked arrangement
BlockExchangeRows(temp_storage.exchange_rows).WarpStripedToBlocked(rows);
// Barrier for smem reuse and coherence
__syncthreads();
// Flag row heads by looking for discontinuities
BlockDiscontinuity(temp_storage.discontinuity).FlagHeads(
head_flags, // (Out) Head flags
rows, // Original row ids
NewRowOp(), // Functor for detecting start of new rows
prefix_op.running_prefix.row); // Last row ID from previous tile to compare with first row ID in this tile
// Assemble partial product structures
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
partial_sums[ITEM].partial = values[ITEM];
partial_sums[ITEM].row = rows[ITEM];
}
// Reduce-value-by-row across partial_sums using an exclusive prefix scan
PartialProduct block_aggregate;
BlockScan(temp_storage.scan).ExclusiveScan(
partial_sums, // Scan input
partial_sums, // Scan output
ReduceByKeyOp(), // Scan operator
block_aggregate, // Block-wide total (unused)
prefix_op); // Prefix operator for seeding the block-wide scan with the running total
// Barrier for smem reuse and coherence
__syncthreads();
// Scatter an accumulated dot product if it is the head of a valid row
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
if (head_flags[ITEM])
{
d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial;
// Save off the first partial product that this thread block will scatter
if (partial_sums[ITEM].row == temp_storage.first_block_row)
{
temp_storage.first_product = partial_sums[ITEM].partial;
}
}
}
}
/**
* Iterate over input tiles belonging to this thread block
*/
__device__ __forceinline__
void ProcessTiles()
{
// Process full tiles
while (block_offset <= block_end - TILE_ITEMS)
{
ProcessTile<true>(block_offset);
block_offset += TILE_ITEMS;
}
// Process the last, partially-full tile (if present)
int guarded_items = block_end - block_offset;
if (guarded_items)
{
ProcessTile<false>(block_offset, guarded_items);
}
if (threadIdx.x == 0)
{
if (gridDim.x == 1)
{
// Scatter the final aggregate (this kernel contains only 1 thread block)
d_result[prefix_op.running_prefix.row] = prefix_op.running_prefix.partial;
}
else
{
// Write the first and last partial products from this thread block so
// that they can be subsequently "fixed up" in the next kernel.
PartialProduct first_product;
first_product.row = temp_storage.first_block_row;
first_product.partial = temp_storage.first_product;
d_block_partials[blockIdx.x * 2] = first_product;
d_block_partials[(blockIdx.x * 2) + 1] = prefix_op.running_prefix;
}
}
}
};
/**
* Thread block abstraction for "fixing up" an array of inter-block SpMV partial products.
*/
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
typename VertexId,
typename Value>
struct FinalizeSpmvBlock
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// Constants
enum
{
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
};
// Head flag type
typedef int HeadFlag;
// Partial dot product type
typedef PartialProduct<VertexId, Value> PartialProduct;
// Parameterized BlockScan type for reduce-value-by-row scan
typedef BlockScan<PartialProduct, BLOCK_THREADS, BLOCK_SCAN_RAKING_MEMOIZE> BlockScan;
// Parameterized BlockDiscontinuity type for setting head-flags for each new row segment
typedef BlockDiscontinuity<HeadFlag, BLOCK_THREADS> BlockDiscontinuity;
// Shared memory type for this thread block
struct TempStorage
{
typename BlockScan::TempStorage scan; // Smem needed for reduce-value-by-row scan
typename BlockDiscontinuity::TempStorage discontinuity; // Smem needed for head-flagging
VertexId last_block_row;
};
//---------------------------------------------------------------------
// Thread fields
//---------------------------------------------------------------------
TempStorage &temp_storage;
BlockPrefixCallbackOp<PartialProduct> prefix_op;
Value *d_result;
PartialProduct *d_block_partials;
int num_partials;
//---------------------------------------------------------------------
// Operations
//---------------------------------------------------------------------
/**
* Constructor
*/
__device__ __forceinline__
FinalizeSpmvBlock(
TempStorage &temp_storage,
Value *d_result,
PartialProduct *d_block_partials,
int num_partials)
:
temp_storage(temp_storage),
d_result(d_result),
d_block_partials(d_block_partials),
num_partials(num_partials)
{
// Initialize scalar shared memory values
if (threadIdx.x == 0)
{
VertexId first_block_row = d_block_partials[0].row;
VertexId last_block_row = d_block_partials[num_partials - 1].row;
temp_storage.last_block_row = last_block_row;
// Initialize prefix_op to identity
prefix_op.running_prefix.row = first_block_row;
prefix_op.running_prefix.partial = Value(0);
}
__syncthreads();
}
/**
* Processes a COO input tile of edges, outputting dot products for each row
*/
template <bool FULL_TILE>
__device__ __forceinline__
void ProcessTile(
int block_offset,
int guarded_items = 0)
{
VertexId rows[ITEMS_PER_THREAD];
PartialProduct partial_sums[ITEMS_PER_THREAD];
HeadFlag head_flags[ITEMS_PER_THREAD];
// Load a tile of block partials from previous kernel
if (FULL_TILE)
{
// Full tile
#if CUB_PTX_ARCH >= 350
LoadDirectBlocked<LOAD_LDG>(threadIdx.x, d_block_partials + block_offset, partial_sums);
#else
LoadDirectBlocked(threadIdx.x, d_block_partials + block_offset, partial_sums);
#endif
}
else
{
// Partial tile: pad out-of-bounds items with zero-valued partials keyed to the last row
PartialProduct default_sum;
default_sum.row = temp_storage.last_block_row;
default_sum.partial = Value(0);
#if CUB_PTX_ARCH >= 350
LoadDirectBlocked<LOAD_LDG>(threadIdx.x, d_block_partials + block_offset, partial_sums, guarded_items, default_sum);
#else
LoadDirectBlocked(threadIdx.x, d_block_partials + block_offset, partial_sums, guarded_items, default_sum);
#endif
}
// Copy out row IDs for row-head flagging
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
rows[ITEM] = partial_sums[ITEM].row;
}
// Flag row heads by looking for discontinuities
BlockDiscontinuity(temp_storage.discontinuity).FlagHeads(
head_flags, // (Out) Head flags
rows, // Original row ids
NewRowOp(), // Functor for detecting start of new rows
prefix_op.running_prefix.row); // Last row ID from previous tile to compare with first row ID in this tile
// Reduce-value-by-row across partial_sums using an exclusive prefix scan
PartialProduct block_aggregate;
BlockScan(temp_storage.scan).ExclusiveScan(
partial_sums, // Scan input
partial_sums, // Scan output
ReduceByKeyOp(), // Scan operator
block_aggregate, // Block-wide total (unused)
prefix_op); // Prefix operator for seeding the block-wide scan with the running total
// Scatter an accumulated dot product if it is the head of a valid row
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
if (head_flags[ITEM])
{
d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial;
}
}
}
/**
* Iterate over input tiles belonging to this thread block
*/
__device__ __forceinline__
void ProcessTiles()
{
// Process full tiles
int block_offset = 0;
while (block_offset <= num_partials - TILE_ITEMS)
{
ProcessTile<true>(block_offset);
block_offset += TILE_ITEMS;
}
// Process final partial tile (if present)
int guarded_items = num_partials - block_offset;
if (guarded_items)
{
ProcessTile<false>(block_offset, guarded_items);
}
// Scatter the final aggregate (this kernel contains only 1 thread block)
if (threadIdx.x == 0)
{
d_result[prefix_op.running_prefix.row] = prefix_op.running_prefix.partial;
}
}
};
/******************************************************************************
* Kernel entrypoints
******************************************************************************/
/**
* SpMV kernel whose thread blocks each process a contiguous segment of sparse COO tiles.
*/
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
typename VertexId,
typename Value>
__launch_bounds__ (BLOCK_THREADS)
__global__ void CooKernel(
GridEvenShare<int> even_share,
PartialProduct<VertexId, Value> *d_block_partials,
VertexId *d_rows,
VertexId *d_columns,
Value *d_values,
Value *d_vector,
Value *d_result)
{
// Specialize SpMV thread block abstraction type
typedef PersistentBlockSpmv<BLOCK_THREADS, ITEMS_PER_THREAD, VertexId, Value> PersistentBlockSpmv;
// Shared memory allocation
__shared__ typename PersistentBlockSpmv::TempStorage temp_storage;
// Initialize thread block even-share to tell us where to start and stop our tile-processing
even_share.BlockInit();
// Construct persistent thread block
PersistentBlockSpmv persistent_block(
temp_storage,
d_rows,
d_columns,
d_values,
d_vector,
d_result,
d_block_partials,
even_share.block_offset,
even_share.block_end);
// Process input tiles
persistent_block.ProcessTiles();
}
/**
* Kernel for "fixing up" an array of interblock SpMV partial products.
*/
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
typename VertexId,
typename Value>
__launch_bounds__ (BLOCK_THREADS, 1)
__global__ void CooFinalizeKernel(
PartialProduct<VertexId, Value> *d_block_partials,
int num_partials,
Value *d_result)
{
// Specialize "fix-up" thread block abstraction type
typedef FinalizeSpmvBlock<BLOCK_THREADS, ITEMS_PER_THREAD, VertexId, Value> FinalizeSpmvBlock;
// Shared memory allocation
__shared__ typename FinalizeSpmvBlock::TempStorage temp_storage;
// Construct persistent thread block
FinalizeSpmvBlock persistent_block(temp_storage, d_result, d_block_partials, num_partials);
// Process input tiles
persistent_block.ProcessTiles();
}
//---------------------------------------------------------------------
// Host subroutines
//---------------------------------------------------------------------
/**
* Simple test of device
*/
template <
int COO_BLOCK_THREADS,
int COO_ITEMS_PER_THREAD,
int COO_SUBSCRIPTION_FACTOR,
int FINALIZE_BLOCK_THREADS,
int FINALIZE_ITEMS_PER_THREAD,
typename VertexId,
typename Value>
void TestDevice(
CooGraph<VertexId, Value>& coo_graph,
Value* h_vector,
Value* h_reference)
{
typedef PartialProduct<VertexId, Value> PartialProduct;
const int COO_TILE_SIZE = COO_BLOCK_THREADS * COO_ITEMS_PER_THREAD;
// SOA device storage
VertexId *d_rows; // SOA graph row coordinates
VertexId *d_columns; // SOA graph col coordinates
Value *d_values; // SOA graph values
Value *d_vector; // Vector multiplicand
Value *d_result; // Output row totals (y)
PartialProduct *d_block_partials; // Temporary storage for communicating dot product partials between thread blocks
// Create SOA version of coo_graph on host
int num_edges = coo_graph.coo_tuples.size();
VertexId *h_rows = new VertexId[num_edges];
VertexId *h_columns = new VertexId[num_edges];
Value *h_values = new Value[num_edges];
for (int i = 0; i < num_edges; i++)
{
h_rows[i] = coo_graph.coo_tuples[i].row;
h_columns[i] = coo_graph.coo_tuples[i].col;
h_values[i] = coo_graph.coo_tuples[i].val;
}
// Get CUDA properties
Device device_props;
CubDebugExit(device_props.Init());
// Determine launch configuration from kernel properties
int coo_sm_occupancy;
CubDebugExit(device_props.MaxSmOccupancy(
coo_sm_occupancy,
CooKernel<COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, VertexId, Value>,
COO_BLOCK_THREADS));
int max_coo_grid_size = device_props.sm_count * coo_sm_occupancy * COO_SUBSCRIPTION_FACTOR;
// Construct an even-share work distribution
GridEvenShare<int> even_share(num_edges, max_coo_grid_size, COO_TILE_SIZE);
int coo_grid_size = even_share.grid_size;
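// Each COO thread block emits two partial products (its first and its last
// running row total) for subsequent fix-up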
int num_partials = coo_grid_size * 2;
// Allocate COO device arrays
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_rows, sizeof(VertexId) * num_edges));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_columns, sizeof(VertexId) * num_edges));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values, sizeof(Value) * num_edges));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_vector, sizeof(Value) * coo_graph.col_dim));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_result, sizeof(Value) * coo_graph.row_dim));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_block_partials, sizeof(PartialProduct) * num_partials));
// Copy host arrays to device
CubDebugExit(cudaMemcpy(d_rows, h_rows, sizeof(VertexId) * num_edges, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(d_columns, h_columns, sizeof(VertexId) * num_edges, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(d_values, h_values, sizeof(Value) * num_edges, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(d_vector, h_vector, sizeof(Value) * coo_graph.col_dim, cudaMemcpyHostToDevice));
// Bind textures
TexVector<Value>::BindTexture(d_vector, coo_graph.col_dim);
// Print debug info
printf("CooKernel<%d, %d><<<%d, %d>>>(...), Max SM occupancy: %d\n",
COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, coo_grid_size, COO_BLOCK_THREADS, coo_sm_occupancy);
if (coo_grid_size > 1)
{
printf("CooFinalizeKernel<<<1, %d>>>(...)\n", FINALIZE_BLOCK_THREADS);
}
fflush(stdout);
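// Eight-byte shared memory banks suit the double-sized accesses made when
// scanning 16-byte PartialProduct pairs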
CubDebugExit(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));
// Run kernel (always run one iteration without timing)
GpuTimer gpu_timer;
float elapsed_millis = 0.0;
for (int i = 0; i <= g_timing_iterations; i++)
{
gpu_timer.Start();
// Initialize output
CubDebugExit(cudaMemset(d_result, 0, coo_graph.row_dim * sizeof(Value)));
// Run the COO kernel
CooKernel<COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD><<<coo_grid_size, COO_BLOCK_THREADS>>>(
even_share,
d_block_partials,
d_rows,
d_columns,
d_values,
d_vector,
d_result);
if (coo_grid_size > 1)
{
// Run the COO finalize kernel
CooFinalizeKernel<FINALIZE_BLOCK_THREADS, FINALIZE_ITEMS_PER_THREAD><<<1, FINALIZE_BLOCK_THREADS>>>(
d_block_partials,
num_partials,
d_result);
}
gpu_timer.Stop();
if (i > 0)
elapsed_millis += gpu_timer.ElapsedMillis();
}
// Force any kernel stdio to screen
CubDebugExit(cudaDeviceSynchronize());
fflush(stdout);
// Display timing
if (g_timing_iterations > 0)
{
float avg_elapsed = elapsed_millis / g_timing_iterations;
int total_bytes = ((sizeof(VertexId) + sizeof(Value)) * 2 * num_edges) + (sizeof(Value) * coo_graph.row_dim);
printf("%d iterations, average elapsed (%.3f ms), utilized bandwidth (%.3f GB/s), GFLOPS(%.3f)\n",
g_timing_iterations,
avg_elapsed,
total_bytes / avg_elapsed / 1000.0 / 1000.0,
num_edges * 2 / avg_elapsed / 1000.0 / 1000.0);
}
// Check results
int compare = CompareDeviceResults(h_reference, d_result, coo_graph.row_dim, true, g_verbose);
printf("%s\n", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Cleanup
TexVector<Value>::UnbindTexture();
CubDebugExit(g_allocator.DeviceFree(d_block_partials));
CubDebugExit(g_allocator.DeviceFree(d_rows));
CubDebugExit(g_allocator.DeviceFree(d_columns));
CubDebugExit(g_allocator.DeviceFree(d_values));
CubDebugExit(g_allocator.DeviceFree(d_vector));
CubDebugExit(g_allocator.DeviceFree(d_result));
delete[] h_rows;
delete[] h_columns;
delete[] h_values;
}
/**
* Compute reference answer on CPU
*/
template <typename VertexId, typename Value>
void ComputeReference(
CooGraph<VertexId, Value>& coo_graph,
Value* h_vector,
Value* h_reference)
{
for (VertexId i = 0; i < coo_graph.row_dim; i++)
{
h_reference[i] = 0.0;
}
for (VertexId i = 0; i < (VertexId) coo_graph.coo_tuples.size(); i++)
{
h_reference[coo_graph.coo_tuples[i].row] +=
coo_graph.coo_tuples[i].val *
h_vector[coo_graph.coo_tuples[i].col];
}
}
/**
* Assign arbitrary values to vector items
*/
template <typename Value>
void AssignVectorValues(Value *vector, int col_dim)
{
for (int i = 0; i < col_dim; i++)
{
vector[i] = 1.0;
}
}
/**
* Main
*/
int main(int argc, char** argv)
{
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
args.GetCmdLineArgument("i", g_timing_iterations);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s\n [--device=<device-id>] [--v] [--iterations=<test iterations>] [--grid-size=<grid-size>]\n"
"\t--type=wheel --spokes=<spokes>\n"
"\t--type=grid2d --width=<width> [--no-self-loops]\n"
"\t--type=grid3d --width=<width> [--no-self-loops]\n"
"\t--type=market --file=<file>\n"
"\n", argv[0]);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
// Get graph type
string type;
args.GetCmdLineArgument("type", type);
// Generate graph structure
CpuTimer timer;
timer.Start();
CooGraph<VertexId, Value> coo_graph;
if (type == string("grid2d"))
{
VertexId width;
args.GetCmdLineArgument("width", width);
bool self_loops = !args.CheckCmdLineFlag("no-self-loops");
printf("Generating %s grid2d width(%d)... ", (self_loops) ? "5-pt" : "4-pt", width); fflush(stdout);
if (coo_graph.InitGrid2d(width, self_loops)) exit(1);
}
else if (type == string("grid3d"))
{
VertexId width;
args.GetCmdLineArgument("width", width);
bool self_loops = !args.CheckCmdLineFlag("no-self-loops");
printf("Generating %s grid3d width(%d)... ", (self_loops) ? "7-pt" : "6-pt", width); fflush(stdout);
if (coo_graph.InitGrid3d(width, self_loops)) exit(1);
}
else if (type == string("wheel"))
{
VertexId spokes;
args.GetCmdLineArgument("spokes", spokes);
printf("Generating wheel spokes(%d)... ", spokes); fflush(stdout);
if (coo_graph.InitWheel(spokes)) exit(1);
}
else if (type == string("market"))
{
string filename;
args.GetCmdLineArgument("file", filename);
printf("Generating MARKET for %s... ", filename.c_str()); fflush(stdout);
if (coo_graph.InitMarket(filename)) exit(1);
}
else
{
printf("Unsupported graph type\n");
exit(1);
}
timer.Stop();
printf("Done (%.3fs). %d non-zeros, %d rows, %d columns\n",
timer.ElapsedMillis() / 1000.0,
(int) coo_graph.coo_tuples.size(),
coo_graph.row_dim,
coo_graph.col_dim);
fflush(stdout);
if (g_verbose)
{
cout << coo_graph << "\n";
}
// Create vector
Value *h_vector = new Value[coo_graph.col_dim];
AssignVectorValues(h_vector, coo_graph.col_dim);
if (g_verbose)
{
printf("Vector[%d]: ", coo_graph.col_dim);
DisplayResults(h_vector, coo_graph.col_dim);
printf("\n\n");
}
// Compute reference answer
Value *h_reference = new Value[coo_graph.row_dim];
ComputeReference(coo_graph, h_vector, h_reference);
if (g_verbose)
{
printf("Results[%d]: ", coo_graph.row_dim);
DisplayResults(h_reference, coo_graph.row_dim);
printf("\n\n");
}
// Parameterization for SM35
enum
{
COO_BLOCK_THREADS = 64,
COO_ITEMS_PER_THREAD = 10,
COO_SUBSCRIPTION_FACTOR = 4,
FINALIZE_BLOCK_THREADS = 256,
FINALIZE_ITEMS_PER_THREAD = 4,
};
// Run GPU version
TestDevice<
COO_BLOCK_THREADS,
COO_ITEMS_PER_THREAD,
COO_SUBSCRIPTION_FACTOR,
FINALIZE_BLOCK_THREADS,
FINALIZE_ITEMS_PER_THREAD>(coo_graph, h_vector, h_reference);
// Cleanup
delete[] h_vector;
delete[] h_reference;
return 0;
}
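// Build/run sketch (illustrative; assumes this file lives in a CUB checkout's
// experimental/ directory and is named coo_spmv.cu, with test_util.h one
// level up in test/):
//
//   nvcc -O3 -arch=sm_35 -I.. coo_spmv.cu -o coo_spmv
//   ./coo_spmv --type=grid2d --width=1000
//   ./coo_spmv --type=market --file=graph.mtx --i=10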