-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathscan_kernel.cuh
More file actions
125 lines (112 loc) · 4.4 KB
/
scan_kernel.cuh
File metadata and controls
125 lines (112 loc) · 4.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5636 $
// $Date: 2009-07-02 13:39:38 +1000 (Thu, 02 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* scan_kernel.cu
*
* @brief CUDPP kernel-level scan routines
*/
/** \defgroup cudpp_kernel CUDPP Kernel-Level API
* The CUDPP Kernel-Level API contains functions that run on the GPU
* device across a grid of Cooperative Thread Array (CTA, aka Thread
* Block). These kernels are declared \c __global__ so that they
* must be invoked from host (CPU) code. They generally invoke GPU
* \c __device__ routines in the CUDPP \link cudpp_cta CTA-Level API\endlink.
* Kernel-Level API functions are used by CUDPP
* \link cudpp_app Application-Level\endlink functions to implement their
* functionality.
* @{
*/
/** @name Scan Functions
* @{
*/
#include <scan_globals.h>
#include "scan_cta.cuh"
#include "scan_sharedmem.h"
/**
* @brief Main scan kernel
*
* This __global__ device function performs one level of a multiblock scan on
* an arbitrary-dimensioned array in \a d_in, returning the result in \a d_out
* (which may point to the same array). The same function may be used for
* single or multi-row scans. To perform a multirow scan, pass the width of
* each row of the input row (in elements) in \a dataRowPitch, and the width of
* the rows of \a d_blockSums (in elements) in \a blockSumRowPitch, and invoke
* with a thread block grid with height greater than 1.
*
* This function peforms one level of a recursive, multiblock scan. At the
* app level, this function is called by cudppScan and cudppMultiScan and used
* in combination with vectorAddUniform4() to produce a complete scan.
*
* Template parameter \a T is the datatype of the array to be scanned.
* Template parameter \a traits is the ScanTraits struct containing
* compile-time options for the scan, such as whether it is forward or
* backward, exclusive or inclusive, multi- or single-row, etc.
*
* @param[out] d_out The output (scanned) array
* @param[in] d_in The input array to be scanned
* @param[out] d_blockSums The array of per-block sums
* @param[in] numElements The number of elements to scan
* @param[in] dataRowPitch The width of each row of \a d_in in elements
* (for multi-row scans)
* @param[in] blockSumRowPitch The with of each row of \a d_blockSums in elements
* (for multi-row scans)
*/
template<class T, class traits>
__global__ void scan4(T *d_out,
const T *d_in,
T *d_blockSums,
int numElements,
unsigned int dataRowPitch,
unsigned int blockSumRowPitch)
{
SharedMemory<T> smem;
T* temp = smem.getPointer();
int devOffset, ai, bi, aiDev, biDev;
//T threadScan0[4], threadScan1[4];
#if 0
T threadScan[2][2];
#else
T threadScan[2][4];
#endif
unsigned int blockN = numElements;
unsigned int blockSumIndex = blockIdx.x;
if (traits::isMultiRow())
{
int yIndex = __umul24(blockDim.y, blockIdx.y) + threadIdx.y;
devOffset = __umul24(dataRowPitch, yIndex);
blockN += (devOffset << 2);
devOffset += __umul24(blockIdx.x, blockDim.x << 1);
blockSumIndex += __umul24(blockSumRowPitch << 2, yIndex) ;
}
else
{
devOffset = blockIdx.x * (blockDim.x << 1);
}
// load data into shared memory
#if 0
loadSharedChunkFromMem2<T, traits>
(temp, threadScan, d_in, blockN, devOffset, ai, bi, aiDev, biDev);
#else
loadSharedChunkFromMem4<T, traits>
(temp, threadScan, d_in, blockN, devOffset, ai, bi, aiDev, biDev);
#endif
scanCTA<T, traits>(temp, d_blockSums, blockSumIndex);
// write results to device memory
#if 0
storeSharedChunkToMem2<T, traits>
(d_out, threadScan, temp, blockN, devOffset, ai, bi, aiDev, biDev);
#else
storeSharedChunkToMem4<T, traits>
(d_out, threadScan, temp, blockN, devOffset, ai, bi, aiDev, biDev);
#endif
}
/** @} */ // end scan functions
/** @} */ // end cudpp_kernel