/*
SAM v1.1: SAM is a fast prefix-scan template written in CUDA that
supports higher orders and/or tuple values as described in
http://cs.txstate.edu/~burtscher/papers/pldi16.pdf.
Copyright (c) 2016, Texas State University. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted for academic, research, experimental, or personal use provided
that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions, and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Texas State University nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
For all other uses, please contact the Office for Commercialization and Industry
Relations at Texas State University.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Version 1.1 (2016/3/13):
- reduction in auxiliary memory usage
- simplified interface
- small performance optimizations
Authors: Sepideh Maleki, Annie Yang, and Martin Burtscher
*/
// Number of streaming multiprocessors of the target GPU; the kernel is launched
// with SMs * 2 thread blocks, so this must match the device the code runs on.
// NOTE(review): with SMs = 53, gridDim.x would be 106, which violates the
// kernel's (commented-out) assertion "64 >= gridDim.x" and exceeds the 64
// predecessor carries the carry-reduction step can combine — confirm this
// value for the target GPU (it must not exceed 32).
static const int SMs = 53; // this value must match the used GPU
// Size of the circular per-round carry/wait buffers; a power of two so that
// "& MM1" can replace the modulo operation. // do not change
static const int MOD = 256; // do not change
// Bit mask implementing "% MOD" for the ring-buffer indices. // do not change
static const int MM1 = MOD - 1; // do not change
// Maximum supported tuple size; bounded by the 32-lane warp width on which
// the shuffle-based scans rely. // do not increase
static const int Max_Dim = 32; // do not increase
// Device kernel computing a higher-order, tuple-valued inclusive prefix scan.
// Template parameters: T = element type, dim = tuple size (1..Max_Dim),
// order = scan order (>= 1), op = associative combining operator,
// factor = elements processed per thread per chunk.
// Launch contract (see assertions below): blockDim.x == 1024, gridDim.x == SMs * 2.
// gcarry/gwait are ring buffers (MOD slots per round) used for inter-block
// carry propagation; gwait must be zeroed by the host before launch.
// NOTE(review): the template parameter list was missing from the source
// (apparently lost in an angle-bracket-stripping extraction step) and has been
// restored to match the parameters the body uses. The legacy mask-less
// __shfl_up intrinsics were replaced by __shfl_up_sync with a full mask
// (requires CUDA 9+); at every call site entire warps execute the shuffle,
// so the 0xffffffff mask is correct.
template <typename T, int dim, int order, T (*op)(T, T), int factor>
static __global__ __launch_bounds__(1024, 2)
void kSAM(const T * const __restrict__ ginput, T * const __restrict__ goutput, const int items, volatile T * const __restrict__ gcarry, volatile int * const __restrict__ gwait)
{
  /*
  // The following assertions need to hold but are commented out for performance reasons.
  assert(1024 == blockDim.x);
  assert(SMs * 2 == gridDim.x);
  assert(64 >= gridDim.x);
  assert(Max_Dim >= dim);
  */
  // number of 1024*factor-element chunks needed to cover the input
  const int chunks = (items + (1024 * factor - 1)) / (1024 * factor);
  const int tid = threadIdx.x;
  const int warp = tid >> 5; // warp index within the block (0..31)
  const int lane = tid & 31; // lane index within the warp
  // the tuple-component index of a thread's element rotates by corr for every
  // 1024 elements of stride (1024 is generally not a multiple of dim)
  const int corr = 1024 % dim;
  __shared__ T globcarry[dim][order]; // running carry per component and round
  __shared__ T tempcarry[dim];        // carry to fold into the current chunk
  __shared__ T sbuf[factor][32 * dim]; // per-warp totals for the block-level scan
  for (int i = tid; i < dim * order; i += 1024) {
    globcarry[i / order][i % order] = 0;
  }
  int pos = 0; // first chunk whose published carry this block has not yet consumed
  for (int chunk = blockIdx.x; chunk < chunks; chunk += SMs * 2) {
    const int offs = tid + chunk * (1024 * factor);
    const int firstid = offs % dim; // component handled by val[0]
    const int lastid = (offs + 1024 * (factor - 1)) % dim; // component of val[factor - 1]
    T val[factor];
    if (chunk < chunks - 1) {
      // full chunk: no bounds checks needed
      for (int i = 0; i < factor; i++) {
        val[i] = ginput[offs + 1024 * i];
      }
    } else {
      // last chunk: pad out-of-range elements with the identity (0)
      for (int i = 0; i < factor; i++) {
        val[i] = 0;
        if (offs + 1024 * i < items) {
          val[i] = ginput[offs + 1024 * i];
        }
      }
    }
    for (int round = 0; round < order; round++) {
      // 1) warp-level inclusive scan with stride dim (same-component elements
      //    are dim lanes apart, so the Kogge-Stone distance starts at dim)
      for (int i = 0; i < factor; i++) {
        for (int d = dim; d < 32; d *= 2) {
          T tmp = __shfl_up_sync(0xffffffff, val[i], d);
          if (lane >= d) val[i] = op(val[i], tmp);
        }
      }
      // 2) the top dim lanes of each warp publish the warp's per-component totals
      if (lane >= (32 - dim)) {
        const int tix = warp * dim;
        int id = firstid;
        for (int i = 0; i < factor; i++) {
          sbuf[i][tix + id] = val[i];
          id += corr; // component index rotates by corr per 1024-element slice
          if (id >= dim) id -= dim;
        }
      }
      __syncthreads();
      // 3) scan the 32 per-warp totals of each component (one warp per component)
      if (warp < dim) {
        const int idx = (lane * dim) + warp;
        for (int i = 0; i < factor; i++) {
          T v = sbuf[i][idx];
          for (int d = 1; d < 32; d *= 2) {
            T tmp = __shfl_up_sync(0xffffffff, v, d);
            if (lane >= d) v = op(v, tmp);
          }
          sbuf[i][idx] = v;
        }
      }
      __syncthreads();
      // 4) fold the scanned totals of all preceding warps into each element
      if (warp > 0) {
        const int tix = warp * dim - dim;
        int id = firstid;
        for (int i = 0; i < factor; i++) {
          val[i] = op(val[i], sbuf[i][tix + id]);
          id += corr;
          if (id >= dim) id -= dim;
        }
      }
      // 5) propagate the totals of the earlier 1024-element slices of this chunk
      T carry[dim];
      for (int d = 0; d < dim; d++) {
        carry[d] = 0;
      }
      int id = firstid;
      for (int i = 1; i < factor; i++) {
        for (int d = 0; d < dim; d++) {
          carry[d] = op(carry[d], sbuf[i - 1][31 * dim + d]);
        }
        id += corr;
        if (id >= dim) id -= dim;
        val[i] = op(val[i], carry[id]);
      }
      // 6) the last dim threads publish this chunk's per-component totals and
      //    raise the chunk's wait flag (after a fence so the data is visible first)
      int wait = round + 1;
      if (tid > 1023 - dim) {
        gcarry[lastid * (order * MOD) + round * MOD + (chunk & MM1)] = val[factor - 1];
        // pre-clear the flag of a future chunk that will reuse this ring slot
        gwait[round * MOD + ((chunk + (MOD - 4 * SMs)) & MM1)] = 0;
        __threadfence();
        if (tid == 1023) {
          gwait[round * MOD + (chunk & MM1)] = wait;
        }
      }
      // 7) spin until every not-yet-consumed predecessor chunk has published
      //    its carry for this round (one thread watches one predecessor)
      const int tidx = pos + tid;
      if (tidx < chunk) {
        wait = gwait[round * MOD + (tidx & MM1)];
      }
      while (__syncthreads_count(wait <= round) != 0) {
        if (wait <= round) {
          wait = gwait[round * MOD + (tidx & MM1)];
        }
      }
      // 8) reduce the predecessors' carries (one warp per component); when
      //    gridDim.x exceeds 32 (SMs > 16, a compile-time constant) a lane may
      //    have to combine a second predecessor carry
      if (warp < dim) {
        int posx = pos + lane;
        T carry = 0;
        if (posx < chunk) {
          carry = gcarry[warp * (order * MOD) + round * MOD + (posx & MM1)];
        }
        if (SMs > 16) {
          posx += 32;
          if (posx < chunk) {
            carry = op(carry, gcarry[warp * (order * MOD) + round * MOD + (posx & MM1)]);
          }
        }
        for (int d = 1; d < 32; d *= 2) {
          carry = op(carry, __shfl_up_sync(0xffffffff, carry, d));
        }
        // lane 31 holds the full reduction: update the running carry and
        // stage it for step 10
        if (lane == 31) {
          T temp = op(globcarry[warp][round], carry);
          tempcarry[warp] = globcarry[warp][round] = temp;
        }
      }
      __syncthreads();
      // 9) include this chunk's own totals in the running carry (for later chunks)
      if (tid > 1023 - dim) {
        globcarry[lastid][round] = op(globcarry[lastid][round], val[factor - 1]);
      }
      // 10) fold the accumulated carry into every element of this chunk
      id = firstid;
      for (int i = 0; i < factor; i++) {
        val[i] = op(val[i], tempcarry[id]);
        id += corr;
        if (id >= dim) id -= dim;
      }
    } // round
    if (chunk < chunks - 1) {
      for (int i = 0; i < factor; i++) {
        goutput[offs + 1024 * i] = val[i];
      }
    } else {
      // last chunk: only write the in-range elements
      for (int i = 0; i < factor; i++) {
        if (offs + 1024 * i < items) {
          goutput[offs + 1024 * i] = val[i];
        }
      }
    }
    pos = chunk + 1; // carries up to and including this chunk are now folded in
  } // chunk
}
// Host-side launcher: allocates (once per template instantiation) the auxiliary
// buffer, clears the wait flags, and launches kSAM with the configuration the
// kernel's assertions require: SMs * 2 blocks of 1024 threads.
// Buffer layout: order*MOD ints of gwait flags, followed by dim*order*MOD
// elements of T for gcarry. The T region starts at a byte offset of
// order*MOD*sizeof(int) = order*1024 bytes, which keeps it suitably aligned.
// NOTE(review): the template parameter list and the <<<...>>> launch
// configuration were missing from the source (apparently lost in an
// angle-bracket-stripping extraction step); they have been restored from the
// kernel's gridDim/blockDim assertions and its template parameter usage.
// NOTE(review): the cudaMalloc/cudaMemsetAsync return codes are not checked,
// matching the original code; an allocation failure leaves aux NULL and the
// kernel launch will fault — consider adding error checking.
template <typename T, int dim, int order, T (*op)(T, T), int factor>
static void rSAM(const T * const __restrict__ ginput, T * const __restrict__ goutput, const int items)
{
  // allocated once and deliberately never freed: reused across calls
  static int* aux = NULL;
  if (aux == NULL) {
    cudaMalloc(&aux, order * MOD * sizeof(int) + dim * order * MOD * sizeof(T));
  }
  // reset the per-round wait flags before each scan
  cudaMemsetAsync(aux, 0, order * MOD * sizeof(int));
  kSAM<T, dim, order, op, factor><<<SMs * 2, 1024>>>(ginput, goutput, items, (T *)&aux[order * MOD], aux);
}
/*
Below is the SAM function that should be called like this:
SAM<T, dim, order, op>(input, output, items);
Template parameters
-------------------
T: type of the elements (the code has only been tested with "int" and "long")
dim: tuple size (1 through 32, where 1 is a normal prefix scan)
order: requested order (at least 1, where 1 is a normal prefix scan)
op: associative operator (e.g., sum, max, xor)
For example, the sum operator would be specified as follows:
template <typename T>
static __host__ __device__ T sum(T a, T b)
{
return a + b;
}
Inputs
------
items: number of elements in input and output arrays
input: array of values over which to perform the prefix scan (input must have "items" elements)
Output
------
output: result of the prefix scan (output must have capacity for "items" elements)
*/