/******************************************************************************
* Copyright (c) 2011, Duane Merrill. All rights reserved.
* Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/
/******************************************************************************
* An implementation of COO SpMV using prefix scan to implement a
* reduce-value-by-row strategy
******************************************************************************/
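//
// Overview: each nonzero A(row, col) contributes A(row, col) * x(col) to the
// output row sum y(row). Tiles of COO nonzeros are scanned with a
// reduce-value-by-row operator whose running sum restarts whenever the row-id
// changes; accumulated totals are scattered at row-head positions. When more
// than one thread block is launched, each block also records its first and
// last partial products so that a second, single-block kernel
// (CooFinalizeKernel) can "fix up" rows that straddle block boundaries.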
// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR
#include <iterator>
#include <vector>
#include <string>
#include <algorithm>
#include <stdio.h>
#include <cub/cub.cuh>
#include "coo_graph.cuh"
#include "../test/test_util.h"
using namespace cub;
using namespace std;
/******************************************************************************
* Globals, constants, and typedefs
******************************************************************************/
typedef int VertexId; // signed 32-bit ints as vertex ids
typedef double Value; // double-precision floating point values
bool g_verbose = false;
int g_timing_iterations = 1;
CachingDeviceAllocator g_allocator;
/******************************************************************************
* Texture referencing
******************************************************************************/
/**
* Templated texture reference type for multiplicand vector
*/
template <typename Value>
struct TexVector
{
// Texture type to actually use (e.g., because CUDA doesn't load doubles as texture items)
typedef typename If<(Equals<Value, double>::VALUE), uint2, Value>::Type CastType;
// Texture reference type
typedef texture<CastType, cudaTextureType1D, cudaReadModeElementType> TexRef;
static TexRef ref;
/**
* Bind textures
*/
static void BindTexture(void *d_in, int elements)
{
cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc<CastType>();
if (d_in)
{
size_t offset;
size_t bytes = sizeof(CastType) * elements;
CubDebugExit(cudaBindTexture(&offset, ref, d_in, tex_desc, bytes));
}
}
/**
* Unbind textures
*/
static void UnbindTexture()
{
CubDebugExit(cudaUnbindTexture(ref));
}
/**
* Load
*/
static __device__ __forceinline__ Value Load(int offset)
{
Value output;
reinterpret_cast<typename TexVector<Value>::CastType &>(output) = tex1Dfetch(TexVector<Value>::ref, offset);
return output;
}
};
// Texture reference definitions
template <typename Value>
typename TexVector<Value>::TexRef TexVector<Value>::ref = 0;
/******************************************************************************
* Utility types
******************************************************************************/
/**
* A partial dot-product sum paired with a corresponding row-id
*/
template <typename VertexId, typename Value>
struct PartialProduct
{
VertexId row; /// Row-id
Value partial; /// PartialProduct sum
};
/**
* A partial dot-product sum paired with a corresponding row-id (specialized for double-int pairings)
*/
template <>
struct PartialProduct<int, double>
{
long long row; /// Row-id
double partial; /// PartialProduct sum
};
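// (The row-id is widened to 64 bits here, presumably so the struct forms a
// naturally-aligned 16-byte pair when the value type is a double.)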
/**
* Reduce-value-by-row scan operator
*/
struct ReduceByKeyOp
{
template <typename PartialProduct>
__device__ __forceinline__ PartialProduct operator()(
const PartialProduct &first,
const PartialProduct &second)
{
PartialProduct retval;
retval.partial = (second.row != first.row) ?
second.partial :
first.partial + second.partial;
retval.row = second.row;
return retval;
}
};
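// Illustrative example: combining the (row, partial) sequence
// (0,a) (0,b) (1,c) (1,d) left-to-right with this operator yields
// (0,a+b), then (1,c), then (1,c+d): sums accumulate within a row and
// restart whenever the row-id changes.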
/**
* Stateful block-wide prefix operator for BlockScan
*/
template <typename PartialProduct>
struct BlockPrefixCallbackOp
{
// Running block-wide prefix
PartialProduct running_prefix;
/**
* Returns the block-wide running_prefix in thread-0
*/
__device__ __forceinline__ PartialProduct operator()(
const PartialProduct &block_aggregate) ///< The aggregate sum of the BlockScan inputs
{
ReduceByKeyOp scan_op;
PartialProduct retval = running_prefix;
running_prefix = scan_op(running_prefix, block_aggregate);
return retval;
}
};
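// E.g., if the running prefix is (row 0, sum 5) and a tile's block-wide
// aggregate is (row 0, sum 3), the callback returns (0, 5) to seed that
// tile's exclusive scan and advances the running prefix to (0, 8).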
/**
* Operator for detecting discontinuities in a list of row identifiers.
*/
struct NewRowOp
{
/// Returns true if row_b is the start of a new row
template <typename VertexId>
__device__ __forceinline__ bool operator()(
const VertexId& row_a,
const VertexId& row_b)
{
return (row_a != row_b);
}
};
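// E.g., rows (0, 0, 1, 1, 3) produce head flags (?, 0, 1, 0, 1), where the
// first flag is decided by comparing against the tile-predecessor row-id
// passed to BlockDiscontinuity::FlagHeads.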
/******************************************************************************
* Persistent thread block types
******************************************************************************/
/**
* SpMV thread block abstraction for processing a contiguous segment of
* sparse COO tiles.
*/
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
typename VertexId,
typename Value>
struct PersistentBlockSpmv
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// Constants
enum
{
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
};
// Head flag type
typedef int HeadFlag;
// Partial dot product type
typedef PartialProduct<VertexId, Value> PartialProduct;
// Parameterized BlockScan type for reduce-value-by-row scan
typedef BlockScan<PartialProduct, BLOCK_THREADS, BLOCK_SCAN_RAKING_MEMOIZE> BlockScan;
// Parameterized BlockExchange type for transposing rows from a warp-striped to a blocked arrangement
typedef BlockExchange<VertexId, BLOCK_THREADS, ITEMS_PER_THREAD, true> BlockExchangeRows;
// Parameterized BlockExchange type for transposing values from a warp-striped to a blocked arrangement
typedef BlockExchange<Value, BLOCK_THREADS, ITEMS_PER_THREAD, true> BlockExchangeValues;
// Parameterized BlockDiscontinuity type for setting head-flags for each new row segment
typedef BlockDiscontinuity<HeadFlag, BLOCK_THREADS> BlockDiscontinuity;
// Shared memory type for this thread block
struct TempStorage
{
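// Note: the exchange storage may alias the scan/discontinuity storage;
// the __syncthreads() barriers in ProcessTile separate their live ranges.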
union
{
typename BlockExchangeRows::TempStorage exchange_rows; // Smem needed for BlockExchangeRows
typename BlockExchangeValues::TempStorage exchange_values; // Smem needed for BlockExchangeValues
struct
{
typename BlockScan::TempStorage scan; // Smem needed for BlockScan
typename BlockDiscontinuity::TempStorage discontinuity; // Smem needed for BlockDiscontinuity
};
};
VertexId first_block_row; ///< The first row-ID seen by this thread block
VertexId last_block_row; ///< The last row-ID seen by this thread block
Value first_product; ///< The first dot-product written by this thread block
};
//---------------------------------------------------------------------
// Thread fields
//---------------------------------------------------------------------
TempStorage &temp_storage;
BlockPrefixCallbackOp<PartialProduct> prefix_op;
VertexId *d_rows;
VertexId *d_columns;
Value *d_values;
Value *d_vector;
Value *d_result;
PartialProduct *d_block_partials;
int block_offset;
int block_end;
//---------------------------------------------------------------------
// Operations
//---------------------------------------------------------------------
/**
* Constructor
*/
__device__ __forceinline__
PersistentBlockSpmv(
TempStorage &temp_storage,
VertexId *d_rows,
VertexId *d_columns,
Value *d_values,
Value *d_vector,
Value *d_result,
PartialProduct *d_block_partials,
int block_offset,
int block_end)
:
temp_storage(temp_storage),
d_rows(d_rows),
d_columns(d_columns),
d_values(d_values),
d_vector(d_vector),
d_result(d_result),
d_block_partials(d_block_partials),
block_offset(block_offset),
block_end(block_end)
{
// Initialize scalar shared memory values
if (threadIdx.x == 0)
{
VertexId first_block_row = d_rows[block_offset];
VertexId last_block_row = d_rows[block_end - 1];
temp_storage.first_block_row = first_block_row;
temp_storage.last_block_row = last_block_row;
temp_storage.first_product = Value(0);
// Initialize prefix_op to identity
prefix_op.running_prefix.row = first_block_row;
prefix_op.running_prefix.partial = Value(0);
}
__syncthreads();
}
/**
* Processes a COO input tile of edges, outputting dot products for each row
*/
template <bool FULL_TILE>
__device__ __forceinline__ void ProcessTile(
int block_offset,
int guarded_items = 0)
{
VertexId columns[ITEMS_PER_THREAD];
VertexId rows[ITEMS_PER_THREAD];
Value values[ITEMS_PER_THREAD];
PartialProduct partial_sums[ITEMS_PER_THREAD];
HeadFlag head_flags[ITEMS_PER_THREAD];
// Load a thread block-striped tile of A (sparse row-ids, column-ids, and values)
if (FULL_TILE)
{
// Unguarded loads
LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_columns + block_offset, columns);
LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_values + block_offset, values);
LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_rows + block_offset, rows);
}
else
{
// Partial tile (e.g., the last tile of input): extend the row-id of the
// block's last vertex to out-of-bounds items, but make them zero-valued
LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_columns + block_offset, columns, guarded_items, VertexId(0));
LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_values + block_offset, values, guarded_items, Value(0));
LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_rows + block_offset, rows, guarded_items, temp_storage.last_block_row);
}
// Load the referenced values from x and compute the dot-product partial sums
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
#if CUB_PTX_ARCH >= 350
values[ITEM] *= ThreadLoad<LOAD_LDG>(d_vector + columns[ITEM]);
#else
values[ITEM] *= TexVector<Value>::Load(columns[ITEM]);
#endif
}
// Transpose from warp-striped to blocked arrangement
BlockExchangeValues(temp_storage.exchange_values).WarpStripedToBlocked(values);
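// Barrier for smem reuse and coherence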
__syncthreads();
// Transpose from warp-striped to blocked arrangement
BlockExchangeRows(temp_storage.exchange_rows).WarpStripedToBlocked(rows);
// Barrier for smem reuse and coherence
__syncthreads();
// Flag row heads by looking for discontinuities
BlockDiscontinuity(temp_storage.discontinuity).FlagHeads(
head_flags, // (Out) Head flags
rows, // Original row ids
NewRowOp(), // Functor for detecting start of new rows
prefix_op.running_prefix.row); // Last row ID from previous tile to compare with first row ID in this tile
// Assemble partial product structures
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
partial_sums[ITEM].partial = values[ITEM];
partial_sums[ITEM].row = rows[ITEM];
}
// Reduce-value-by-row across partial_sums using an exclusive prefix scan
PartialProduct block_aggregate;
BlockScan(temp_storage.scan).ExclusiveScan(
partial_sums, // Scan input
partial_sums, // Scan output
ReduceByKeyOp(), // Scan operator
block_aggregate, // Block-wide total (unused)
prefix_op); // Prefix operator for seeding the block-wide scan with the running total
// Barrier for smem reuse and coherence
__syncthreads();
// Scatter an accumulated dot product if it is the head of a valid row
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
if (head_flags[ITEM])
{
d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial;
// Save off the first partial product that this thread block will scatter
if (partial_sums[ITEM].row == temp_storage.first_block_row)
{
temp_storage.first_product = partial_sums[ITEM].partial;
}
}
}
}
/**
* Iterate over input tiles belonging to this thread block
*/
__device__ __forceinline__
void ProcessTiles()
{
// Process full tiles
while (block_offset <= block_end - TILE_ITEMS)
{
ProcessTile<true>(block_offset);
block_offset += TILE_ITEMS;
}
// Process the last, partially-full tile (if present)
int guarded_items = block_end - block_offset;
if (guarded_items)
{
ProcessTile<false>(block_offset, guarded_items);
}
if (threadIdx.x == 0)
{
if (gridDim.x == 1)
{
// Scatter the final aggregate (this kernel contains only 1 thread block)
d_result[prefix_op.running_prefix.row] = prefix_op.running_prefix.partial;
}
else
{
// Write the first and last partial products from this thread block so
// that they can be subsequently "fixed up" in the next kernel.
PartialProduct first_product;
first_product.row = temp_storage.first_block_row;
first_product.partial = temp_storage.first_product;
d_block_partials[blockIdx.x * 2] = first_product;
d_block_partials[(blockIdx.x * 2) + 1] = prefix_op.running_prefix;
}
}
}
};
/**
* Thread block abstraction for "fixing up" an array of inter-block SpMV partial products.
*/
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
typename VertexId,
typename Value>
struct FinalizeSpmvBlock
{
//---------------------------------------------------------------------
// Types and constants
//---------------------------------------------------------------------
// Constants
enum
{
TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
};
// Head flag type
typedef int HeadFlag;
// Partial dot product type
typedef PartialProduct<VertexId, Value> PartialProduct;
// Parameterized BlockScan type for reduce-value-by-row scan
typedef BlockScan<PartialProduct, BLOCK_THREADS, BLOCK_SCAN_RAKING_MEMOIZE> BlockScan;
// Parameterized BlockDiscontinuity type for setting head-flags for each new row segment
typedef BlockDiscontinuity<HeadFlag, BLOCK_THREADS> BlockDiscontinuity;
// Shared memory type for this thread block
struct TempStorage
{
typename BlockScan::TempStorage scan; // Smem needed for reduce-value-by-row scan
typename BlockDiscontinuity::TempStorage discontinuity; // Smem needed for head-flagging
VertexId last_block_row;
};
//---------------------------------------------------------------------
// Thread fields
//---------------------------------------------------------------------
TempStorage &temp_storage;
BlockPrefixCallbackOp<PartialProduct> prefix_op;
Value *d_result;
PartialProduct *d_block_partials;
int num_partials;
//---------------------------------------------------------------------
// Operations
//---------------------------------------------------------------------
/**
* Constructor
*/
__device__ __forceinline__
FinalizeSpmvBlock(
TempStorage &temp_storage,
Value *d_result,
PartialProduct *d_block_partials,
int num_partials)
:
temp_storage(temp_storage),
d_result(d_result),
d_block_partials(d_block_partials),
num_partials(num_partials)
{
// Initialize scalar shared memory values
if (threadIdx.x == 0)
{
VertexId first_block_row = d_block_partials[0].row;
VertexId last_block_row = d_block_partials[num_partials - 1].row;
temp_storage.last_block_row = last_block_row;
// Initialize prefix_op to identity
prefix_op.running_prefix.row = first_block_row;
prefix_op.running_prefix.partial = Value(0);
}
__syncthreads();
}
/**
* Processes a COO input tile of edges, outputting dot products for each row
*/
template <bool FULL_TILE>
__device__ __forceinline__
void ProcessTile(
int block_offset,
int guarded_items = 0)
{
VertexId rows[ITEMS_PER_THREAD];
PartialProduct partial_sums[ITEMS_PER_THREAD];
HeadFlag head_flags[ITEMS_PER_THREAD];
// Load a tile of block partials from previous kernel
if (FULL_TILE)
{
// Full tile
#if CUB_PTX_ARCH >= 350
LoadDirectBlocked<LOAD_LDG>(threadIdx.x, d_block_partials + block_offset, partial_sums);
#else
LoadDirectBlocked(threadIdx.x, d_block_partials + block_offset, partial_sums);
#endif
}
else
{
// Partial tile: pad out-of-bounds items with zero-valued partials keyed to the last row
PartialProduct default_sum;
default_sum.row = temp_storage.last_block_row;
default_sum.partial = Value(0);
#if CUB_PTX_ARCH >= 350
LoadDirectBlocked<LOAD_LDG>(threadIdx.x, d_block_partials + block_offset, partial_sums, guarded_items, default_sum);
#else
LoadDirectBlocked(threadIdx.x, d_block_partials + block_offset, partial_sums, guarded_items, default_sum);
#endif
}
// Copy out row IDs for row-head flagging
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
rows[ITEM] = partial_sums[ITEM].row;
}
// Flag row heads by looking for discontinuities
BlockDiscontinuity(temp_storage.discontinuity).FlagHeads(
head_flags, // (Out) Head flags
rows, // Original row ids
NewRowOp(), // Functor for detecting start of new rows
prefix_op.running_prefix.row); // Last row ID from previous tile to compare with first row ID in this tile
// Reduce-value-by-row across partial_sums using an exclusive prefix scan
PartialProduct block_aggregate;
BlockScan(temp_storage.scan).ExclusiveScan(
partial_sums, // Scan input
partial_sums, // Scan output
ReduceByKeyOp(), // Scan operator
block_aggregate, // Block-wide total (unused)
prefix_op); // Prefix operator for seeding the block-wide scan with the running total
// Scatter an accumulated dot product if it is the head of a valid row
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
{
if (head_flags[ITEM])
{
d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial;
}
}
}
/**
* Iterate over input tiles belonging to this thread block
*/
__device__ __forceinline__
void ProcessTiles()
{
// Process full tiles
int block_offset = 0;
while (block_offset <= num_partials - TILE_ITEMS)
{
ProcessTile<true>(block_offset);
block_offset += TILE_ITEMS;
}
// Process final partial tile (if present)
int guarded_items = num_partials - block_offset;
if (guarded_items)
{
ProcessTile<false>(block_offset, guarded_items);
}
// Scatter the final aggregate (this kernel contains only 1 thread block)
if (threadIdx.x == 0)
{
d_result[prefix_op.running_prefix.row] = prefix_op.running_prefix.partial;
}
}
};
/******************************************************************************
* Kernel entrypoints
******************************************************************************/
/**
* SpMV kernel whose thread blocks each process a contiguous segment of sparse COO tiles.
*/
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
typename VertexId,
typename Value>
__launch_bounds__ (BLOCK_THREADS)
__global__ void CooKernel(
GridEvenShare<int> even_share,
PartialProduct<VertexId, Value> *d_block_partials,
VertexId *d_rows,
VertexId *d_columns,
Value *d_values,
Value *d_vector,
Value *d_result)
{
// Specialize SpMV thread block abstraction type
typedef PersistentBlockSpmv<BLOCK_THREADS, ITEMS_PER_THREAD, VertexId, Value> PersistentBlockSpmv;
// Shared memory allocation
__shared__ typename PersistentBlockSpmv::TempStorage temp_storage;
// Initialize thread block even-share to tell us where to start and stop our tile-processing
even_share.BlockInit();
// Construct persistent thread block
PersistentBlockSpmv persistent_block(
temp_storage,
d_rows,
d_columns,
d_values,
d_vector,
d_result,
d_block_partials,
even_share.block_offset,
even_share.block_end);
// Process input tiles
persistent_block.ProcessTiles();
}
/**
* Kernel for "fixing up" an array of interblock SpMV partial products.
*/
template <
int BLOCK_THREADS,
int ITEMS_PER_THREAD,
typename VertexId,
typename Value>
__launch_bounds__ (BLOCK_THREADS, 1)
__global__ void CooFinalizeKernel(
PartialProduct<VertexId, Value> *d_block_partials,
int num_partials,
Value *d_result)
{
// Specialize "fix-up" thread block abstraction type
typedef FinalizeSpmvBlock<BLOCK_THREADS, ITEMS_PER_THREAD, VertexId, Value> FinalizeSpmvBlock;
// Shared memory allocation
__shared__ typename FinalizeSpmvBlock::TempStorage temp_storage;
// Construct persistent thread block
FinalizeSpmvBlock persistent_block(temp_storage, d_result, d_block_partials, num_partials);
// Process input tiles
persistent_block.ProcessTiles();
}
//---------------------------------------------------------------------
// Host subroutines
//---------------------------------------------------------------------
/**
* Simple test of device
*/
template <
int COO_BLOCK_THREADS,
int COO_ITEMS_PER_THREAD,
int COO_SUBSCRIPTION_FACTOR,
int FINALIZE_BLOCK_THREADS,
int FINALIZE_ITEMS_PER_THREAD,
typename VertexId,
typename Value>
void TestDevice(
CooGraph<VertexId, Value>& coo_graph,
Value* h_vector,
Value* h_reference)
{
typedef PartialProduct<VertexId, Value> PartialProduct;
const int COO_TILE_SIZE = COO_BLOCK_THREADS * COO_ITEMS_PER_THREAD;
// SOA device storage
VertexId *d_rows; // SOA graph row coordinates
VertexId *d_columns; // SOA graph col coordinates
Value *d_values; // SOA graph values
Value *d_vector; // Vector multiplicand
Value *d_result; // Output row totals (y)
PartialProduct *d_block_partials; // Temporary storage for communicating dot product partials between thread blocks
// Create SOA version of coo_graph on host
int num_edges = coo_graph.coo_tuples.size();
VertexId *h_rows = new VertexId[num_edges];
VertexId *h_columns = new VertexId[num_edges];
Value *h_values = new Value[num_edges];
for (int i = 0; i < num_edges; i++)
{
h_rows[i] = coo_graph.coo_tuples[i].row;
h_columns[i] = coo_graph.coo_tuples[i].col;
h_values[i] = coo_graph.coo_tuples[i].val;
}
// Get CUDA properties
Device device_props;
CubDebugExit(device_props.Init());
// Determine launch configuration from kernel properties
int coo_sm_occupancy;
CubDebugExit(device_props.MaxSmOccupancy(
coo_sm_occupancy,
CooKernel<COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, VertexId, Value>,
COO_BLOCK_THREADS));
int max_coo_grid_size = device_props.sm_count * coo_sm_occupancy * COO_SUBSCRIPTION_FACTOR;
// Construct an even-share work distribution
GridEvenShare<int> even_share(num_edges, max_coo_grid_size, COO_TILE_SIZE);
int coo_grid_size = even_share.grid_size;
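// Each COO thread block emits two partial products (its first and its last
// running row total) for subsequent fix-up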
int num_partials = coo_grid_size * 2;
// Allocate COO device arrays
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_rows, sizeof(VertexId) * num_edges));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_columns, sizeof(VertexId) * num_edges));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values, sizeof(Value) * num_edges));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_vector, sizeof(Value) * coo_graph.col_dim));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_result, sizeof(Value) * coo_graph.row_dim));
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_block_partials, sizeof(PartialProduct) * num_partials));
// Copy host arrays to device
CubDebugExit(cudaMemcpy(d_rows, h_rows, sizeof(VertexId) * num_edges, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(d_columns, h_columns, sizeof(VertexId) * num_edges, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(d_values, h_values, sizeof(Value) * num_edges, cudaMemcpyHostToDevice));
CubDebugExit(cudaMemcpy(d_vector, h_vector, sizeof(Value) * coo_graph.col_dim, cudaMemcpyHostToDevice));
// Bind textures
TexVector<Value>::BindTexture(d_vector, coo_graph.col_dim);
// Print debug info
printf("CooKernel<%d, %d><<<%d, %d>>>(...), Max SM occupancy: %d\n",
COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, coo_grid_size, COO_BLOCK_THREADS, coo_sm_occupancy);
if (coo_grid_size > 1)
{
printf("CooFinalizeKernel<<<1, %d>>>(...)\n", FINALIZE_BLOCK_THREADS);
}
fflush(stdout);
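// Eight-byte shared memory banks suit the double-sized accesses made when
// scanning 16-byte PartialProduct pairs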
CubDebugExit(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));
// Run kernel (always run one iteration without timing)
GpuTimer gpu_timer;
float elapsed_millis = 0.0;
for (int i = 0; i <= g_timing_iterations; i++)
{
gpu_timer.Start();
// Initialize output
CubDebugExit(cudaMemset(d_result, 0, coo_graph.row_dim * sizeof(Value)));
// Run the COO kernel
CooKernel<COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD><<<coo_grid_size, COO_BLOCK_THREADS>>>(
even_share,
d_block_partials,
d_rows,
d_columns,
d_values,
d_vector,
d_result);
if (coo_grid_size > 1)
{
// Run the COO finalize kernel
CooFinalizeKernel<FINALIZE_BLOCK_THREADS, FINALIZE_ITEMS_PER_THREAD><<<1, FINALIZE_BLOCK_THREADS>>>(
d_block_partials,
num_partials,
d_result);
}
gpu_timer.Stop();
if (i > 0)
elapsed_millis += gpu_timer.ElapsedMillis();
}
// Force any kernel stdio to screen
CubDebugExit(cudaDeviceSynchronize());
fflush(stdout);
// Display timing
if (g_timing_iterations > 0)
{
float avg_elapsed = elapsed_millis / g_timing_iterations;
int total_bytes = ((sizeof(VertexId) + sizeof(Value)) * 2 * num_edges) + (sizeof(Value) * coo_graph.row_dim);
printf("%d iterations, average elapsed (%.3f ms), utilized bandwidth (%.3f GB/s), GFLOPS(%.3f)\n",
g_timing_iterations,
avg_elapsed,
total_bytes / avg_elapsed / 1000.0 / 1000.0,
num_edges * 2 / avg_elapsed / 1000.0 / 1000.0);
}
// Check results
int compare = CompareDeviceResults(h_reference, d_result, coo_graph.row_dim, true, g_verbose);
printf("%s\n", compare ? "FAIL" : "PASS");
AssertEquals(0, compare);
// Cleanup
TexVector<Value>::UnbindTexture();
CubDebugExit(g_allocator.DeviceFree(d_block_partials));
CubDebugExit(g_allocator.DeviceFree(d_rows));
CubDebugExit(g_allocator.DeviceFree(d_columns));
CubDebugExit(g_allocator.DeviceFree(d_values));
CubDebugExit(g_allocator.DeviceFree(d_vector));
CubDebugExit(g_allocator.DeviceFree(d_result));
delete[] h_rows;
delete[] h_columns;
delete[] h_values;
}
/**
* Compute reference answer on CPU
*/
template <typename VertexId, typename Value>
void ComputeReference(
CooGraph<VertexId, Value>& coo_graph,
Value* h_vector,
Value* h_reference)
{
for (VertexId i = 0; i < coo_graph.row_dim; i++)
{
h_reference[i] = 0.0;
}
for (VertexId i = 0; i < (VertexId) coo_graph.coo_tuples.size(); i++)
{
h_reference[coo_graph.coo_tuples[i].row] +=
coo_graph.coo_tuples[i].val *
h_vector[coo_graph.coo_tuples[i].col];
}
}
/**
* Assign arbitrary values to vector items
*/
template <typename Value>
void AssignVectorValues(Value *vector, int col_dim)
{
for (int i = 0; i < col_dim; i++)
{
vector[i] = 1.0;
}
}
/**
* Main
*/
int main(int argc, char** argv)
{
// Initialize command line
CommandLineArgs args(argc, argv);
g_verbose = args.CheckCmdLineFlag("v");
args.GetCmdLineArgument("i", g_timing_iterations);
// Print usage
if (args.CheckCmdLineFlag("help"))
{
printf("%s\n [--device=<device-id>] [--v] [--iterations=<test iterations>] [--grid-size=<grid-size>]\n"
"\t--type=wheel --spokes=<spokes>\n"
"\t--type=grid2d --width=<width> [--no-self-loops]\n"
"\t--type=grid3d --width=<width> [--no-self-loops]\n"
"\t--type=market --file=<file>\n"
"\n", argv[0]);
exit(0);
}
// Initialize device
CubDebugExit(args.DeviceInit());
// Get graph type
string type;
args.GetCmdLineArgument("type", type);
// Generate graph structure
CpuTimer timer;
timer.Start();
CooGraph<VertexId, Value> coo_graph;
if (type == string("grid2d"))
{
VertexId width;
args.GetCmdLineArgument("width", width);
bool self_loops = !args.CheckCmdLineFlag("no-self-loops");
printf("Generating %s grid2d width(%d)... ", (self_loops) ? "5-pt" : "4-pt", width); fflush(stdout);
if (coo_graph.InitGrid2d(width, self_loops)) exit(1);
}
else if (type == string("grid3d"))
{
VertexId width;
args.GetCmdLineArgument("width", width);
bool self_loops = !args.CheckCmdLineFlag("no-self-loops");
printf("Generating %s grid3d width(%d)... ", (self_loops) ? "7-pt" : "6-pt", width); fflush(stdout);
if (coo_graph.InitGrid3d(width, self_loops)) exit(1);
}
else if (type == string("wheel"))
{
VertexId spokes;
args.GetCmdLineArgument("spokes", spokes);
printf("Generating wheel spokes(%d)... ", spokes); fflush(stdout);
if (coo_graph.InitWheel(spokes)) exit(1);
}
else if (type == string("market"))
{
string filename;
args.GetCmdLineArgument("file", filename);
printf("Generating MARKET for %s... ", filename.c_str()); fflush(stdout);
if (coo_graph.InitMarket(filename)) exit(1);
}
else
{
printf("Unsupported graph type\n");
exit(1);
}
timer.Stop();
printf("Done (%.3fs). %d non-zeros, %d rows, %d columns\n",
timer.ElapsedMillis() / 1000.0,
(int) coo_graph.coo_tuples.size(),
coo_graph.row_dim,
coo_graph.col_dim);
fflush(stdout);
if (g_verbose)
{
cout << coo_graph << "\n";
}
// Create vector
Value *h_vector = new Value[coo_graph.col_dim];
AssignVectorValues(h_vector, coo_graph.col_dim);
if (g_verbose)
{
printf("Vector[%d]: ", coo_graph.col_dim);
DisplayResults(h_vector, coo_graph.col_dim);
printf("\n\n");
}
// Compute reference answer
Value *h_reference = new Value[coo_graph.row_dim];
ComputeReference(coo_graph, h_vector, h_reference);
if (g_verbose)
{
printf("Results[%d]: ", coo_graph.row_dim);
DisplayResults(h_reference, coo_graph.row_dim);
printf("\n\n");
}
// Parameterization for SM35
enum
{
COO_BLOCK_THREADS = 64,
COO_ITEMS_PER_THREAD = 10,
COO_SUBSCRIPTION_FACTOR = 4,
FINALIZE_BLOCK_THREADS = 256,
FINALIZE_ITEMS_PER_THREAD = 4,
};
// Run GPU version
TestDevice<
COO_BLOCK_THREADS,
COO_ITEMS_PER_THREAD,
COO_SUBSCRIPTION_FACTOR,
FINALIZE_BLOCK_THREADS,
FINALIZE_ITEMS_PER_THREAD>(coo_graph, h_vector, h_reference);
// Cleanup
delete[] h_vector;
delete[] h_reference;
return 0;
}
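// Build/run sketch (illustrative; assumes this file lives in a CUB checkout's
// experimental/ directory and is named coo_spmv.cu, with test_util.h one
// level up in test/):
//
//   nvcc -O3 -arch=sm_35 -I.. coo_spmv.cu -o coo_spmv
//   ./coo_spmv --type=grid2d --width=1000
//   ./coo_spmv --type=market --file=graph.mtx --i=10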