diff --git a/README.md b/README.md index b71c458..b4d2e7a 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,77 @@ -CUDA Stream Compaction +CIS 565 Project 2 - CUDA Stream Compaction ====================== +* Richard Lee +* Tested on: Windows 7, i7-3720QM @ 2.60GHz 8GB, GT 650M 4GB (Personal Computer) -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** +## Performance analysis +Performance testing was done on each implementation across a range of array sizes, averaged across 100 iterations each. +![](images/scanPerformance.png) +![](images/compactPerformance.png) +Overall, the CPU implementation for both the scan and stream compaction algorithms far outperformed their GPU counterparts. This was most likely due to the fact that they were able to deal with the input array and access memory much more efficiently than the GPU. In addition, I was only able to run the algorithms on inputs up to 2^16 in size, due to hardware restrictions - if run on even larger inputs, the GPU may have been able to take advantage of the parallel algorithms and gain a computational advantage over the CPU implementations. -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +One performance bottleneck I encountered was memory, as I was unable to allocate enough memory for an array of size greater than 2^16 on the GPU. I also found that the work-efficient scan was less performant than the naive scan, which could have been due to the fact that the number of threads allocated was not adjusted at runtime based on the level of up-sweep and down-sweep, which would be an additional feature to implement. -### (TODO: Your README) +## Test Output +``` +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 7 0 ] +==== cpu scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 803684 803691 ] +==== cpu scan, non-power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 803630 803660 ] + passed +==== naive scan, power-of-two ==== + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 803684 803691 ] + passed +==== naive scan, non-power-of-two ==== + passed +==== work-efficient scan, power-of-two ==== + passed +==== work-efficient scan, non-power-of-two ==== + passed +==== thrust scan, power-of-two ==== + passed +==== thrust scan, non-power-of-two ==== + passed -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +**************************** +** SCAN PERFORMANCE TESTS ** +**************************** +CPU POW SCAN TIME ELAPSED : 0.060004 milliseconds. +CPU NPOT SCAN TIME ELAPSED : 0.060004 milliseconds. +NAIVE POW SCAN TIME ELAPSED : 0.419879 milliseconds. +NAIVE NPOT SCAN TIME ELAPSED : 0.361572 milliseconds. +EFFICIENT POW SCAN TIME ELAPSED : 0.492805 milliseconds. +EFFICIENT NPOT SCAN TIME ELAPSED : 0.493135 milliseconds. +THRUST POW SCAN TIME ELAPSED : 1.0536 milliseconds. +THRUST NPOT SCAN TIME ELAPSED : 1.06989 milliseconds. +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +==== cpu compact with scan ==== + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 3 ] + passed +==== work-efficient compact, power-of-two ==== + passed +==== work-efficient compact, non-power-of-two ==== + passed + +***************************************** +** STREAM COMPACTION PERFORMANCE TESTS ** +***************************************** +CPU COMPACT NOSCAN POW TIME ELAPSED : 0.230013 milliseconds. +CPU COMPACT NOSCAN NPOT TIME ELAPSED : 0.230013 milliseconds. +CPU COMPACT SCAN TIME ELAPSED : 0.390022 milliseconds. +EFFICIENT POW COMPACT TIME ELAPSED : 0.530948 milliseconds. +EFFICIENT NPOT COMPACT TIME ELAPSED : 0.533724 milliseconds. +``` \ No newline at end of file diff --git a/images/compactPerformance.png b/images/compactPerformance.png new file mode 100755 index 0000000..95ae7ed Binary files /dev/null and b/images/compactPerformance.png differ diff --git a/images/scanPerformance.png b/images/scanPerformance.png new file mode 100755 index 0000000..6134f62 Binary files /dev/null and b/images/scanPerformance.png differ diff --git a/src/main.cpp b/src/main.cpp index 675da35..97952ea 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -12,9 +12,11 @@ #include #include #include "testing_helpers.hpp" +#include +#include int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; + const int SIZE = 1 << 15; const int NPOT = SIZE - 3; int a[SIZE], b[SIZE], c[SIZE]; @@ -43,7 +45,7 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); @@ -76,6 +78,79 @@ int main(int argc, char* argv[]) { //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + printf("\n"); + printf("****************************\n"); + printf("** SCAN PERFORMANCE TESTS **\n"); + printf("****************************\n"); + uint32_t iterations = 100; + zeroArray(SIZE, c); + auto begin = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < iterations; ++i) + { + StreamCompaction::CPU::scan(SIZE, c, a); + } + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end - begin).count(); + std::cout << "CPU POW SCAN TIME ELAPSED : " << (float)(duration / iterations) * 0.000001 << " milliseconds." << std::endl; + + zeroArray(SIZE, c); + begin = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < iterations; ++i) + { + StreamCompaction::CPU::scan(NPOT, c, a); + } + end = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end - begin).count(); + std::cout << "CPU NPOT SCAN TIME ELAPSED : " << (float)(duration / iterations) * 0.000001 << " milliseconds." << std::endl; + + zeroArray(SIZE, c); + float timer = 0.0f; + for (int i = 0; i < iterations; ++i) + { + timer += StreamCompaction::Naive::scan(SIZE, c, a); + } + std::cout << "NAIVE POW SCAN TIME ELAPSED : " << timer / iterations << " milliseconds." << std::endl; + + zeroArray(SIZE, c); + timer = 0.0f; + for (int i = 0; i < iterations; ++i) + { + timer += StreamCompaction::Naive::scan(NPOT, c, a); + } + std::cout << "NAIVE NPOT SCAN TIME ELAPSED : " << timer / iterations << " milliseconds." << std::endl; + + zeroArray(SIZE, c); + timer = 0.0f; + for (int i = 0; i < iterations; ++i) + { + timer += StreamCompaction::Efficient::scan(SIZE, c, a); + } + std::cout << "EFFICIENT POW SCAN TIME ELAPSED : " << timer / iterations << " milliseconds." << std::endl; + + zeroArray(SIZE, c); + timer = 0.0f; + for (int i = 0; i < iterations; ++i) + { + timer += StreamCompaction::Efficient::scan(NPOT, c, a); + } + std::cout << "EFFICIENT NPOT SCAN TIME ELAPSED : " << timer / iterations << " milliseconds." << std::endl; + + zeroArray(SIZE, c); + timer = 0.0f; + for (int i = 0; i < iterations; ++i) + { + timer += StreamCompaction::Thrust::scan(SIZE, c, a); + } + std::cout << "THRUST POW SCAN TIME ELAPSED : " << timer / iterations << " milliseconds." << std::endl; + + zeroArray(SIZE, c); + timer = 0.0f; + for (int i = 0; i < iterations; ++i) + { + timer += StreamCompaction::Thrust::scan(NPOT, c, a); + } + std::cout << "THRUST NPOT SCAN TIME ELAPSED : " << timer / iterations << " milliseconds." << std::endl; + printf("\n"); printf("*****************************\n"); printf("** STREAM COMPACTION TESTS **\n"); @@ -120,4 +195,55 @@ int main(int argc, char* argv[]) { count = StreamCompaction::Efficient::compact(NPOT, c, a); //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + + printf("\n"); + printf("*****************************************\n"); + printf("** STREAM COMPACTION PERFORMANCE TESTS **\n"); + printf("*****************************************\n"); + + zeroArray(SIZE, c); + begin = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < iterations; ++i) + { + StreamCompaction::CPU::compactWithoutScan(SIZE, c, a); + } + end = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end - begin).count(); + std::cout << "CPU COMPACT NOSCAN POW TIME ELAPSED : " << (float)(duration / iterations) * 0.000001 << " milliseconds." << std::endl; + + zeroArray(SIZE, c); + begin = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < iterations; ++i) + { + StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + } + end = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end - begin).count(); + std::cout << "CPU COMPACT NOSCAN NPOT TIME ELAPSED : " << (float)(duration / iterations) * 0.000001 << " milliseconds." << std::endl; + + zeroArray(SIZE, c); + begin = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < iterations; ++i) + { + StreamCompaction::CPU::compactWithScan(SIZE, c, a); + } + end = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end - begin).count(); + std::cout << "CPU COMPACT SCAN TIME ELAPSED : " << (float)(duration / iterations) * 0.000001 << " milliseconds." << std::endl; + + zeroArray(SIZE, c); + timer = 0.0f; + for (int i = 0; i < iterations; ++i) + { + StreamCompaction::Efficient::compact(SIZE, c, a, &timer); + } + std::cout << "EFFICIENT POW COMPACT TIME ELAPSED : " << timer / iterations << " milliseconds." << std::endl; + + zeroArray(SIZE, c); + timer = 0.0f; + for (int i = 0; i < iterations; ++i) + { + StreamCompaction::Efficient::compact(NPOT, c, a, &timer); + } + std::cout << "EFFICIENT NPOT COMPACT TIME ELAPSED : " << timer / iterations << " milliseconds." << std::endl; } diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index fe872d4..df241ba 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,10 @@ namespace Common { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < n) { + bools[index] = (idata[index] != 0); + } } /** @@ -32,7 +35,12 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < n) { + if (bools[index]) { + odata[indices[index]] = idata[index]; + } + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..e6edf3c 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,4 +1,5 @@ #include +#include #include "cpu.h" namespace StreamCompaction { @@ -8,8 +9,11 @@ namespace CPU { * CPU scan (prefix sum). */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + int sum = 0; + for (int i = 0; i < n; i++) { + odata[i] = sum; + sum += idata[i]; + } } /** @@ -18,8 +22,14 @@ void scan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithoutScan(int n, int *odata, const int *idata) { - // TODO - return -1; + int j = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[j] = idata[i]; + j++; + } + } + return j; } /** @@ -28,8 +38,26 @@ int compactWithoutScan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - // TODO - return -1; + // Map elements to boolean array + std::vector bools(n); + for (int i = 0; i < n; i++) { + bools[i] = (idata[i] != 0); + } + + // Perform exclusive scan on temp array + std::vector indices(n); + scan(n, indices.data(), bools.data()); + + // Scatter + int elementCount; + for (int i = 0; i < n; i++) { + if (bools[i]) { + odata[indices[i]] = idata[i]; + elementCount = indices[i] + 1; + } + } + + return elementCount; } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..7c3796b 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -6,14 +6,74 @@ namespace StreamCompaction { namespace Efficient { -// TODO: __global__ +__global__ void upSweep(int n, int d, int *data, bool isRoot) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + + if (isRoot) { + data[n - 1] = 0; + } + else { + int prevOffset = d == 0 ? 1 : 2 << (d - 1); + int offset = prevOffset * 2; + + if (index % offset == 0) { + data[index + offset - 1] += data[index + prevOffset - 1]; + } + } +} + +__global__ void downSweep(int n, int d, int *data) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + int prevOffset = d == 0 ? 1 : 2 << (d - 1); + int offset = prevOffset * 2; + + if (index < n && index % offset == 0) { + int t = data[index + prevOffset - 1]; + data[index + prevOffset - 1] = data[index + offset - 1]; + data[index + offset - 1] += t; + } +} /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ -void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); +float scan(int n, int *odata, const int *idata) { + int blockSize = 128; + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + int nearestPow = 2 << (ilog2ceil(n) - 1); //assume n > 0 + + int* dev_data; + cudaMalloc((void**)&dev_data, nearestPow * sizeof(int)); + checkCUDAError("cudaMalloc dev_data failed!"); + cudaMemcpy(dev_data, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + // Up-sweep + int numLevels = ilog2ceil(nearestPow); + for (int d = 0; d < numLevels; d++) { + upSweep << > >(nearestPow, d, dev_data, d == (numLevels - 1)); + } + + //Down-sweep + for (int d = numLevels; d >= 0; d--) { + downSweep << > >(nearestPow, d, dev_data); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + + cudaMemcpy(odata, dev_data, sizeof(int) * nearestPow, cudaMemcpyDeviceToHost); + + cudaFree(dev_data); + return milliseconds; } /** @@ -25,9 +85,81 @@ void scan(int n, int *odata, const int *idata) { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. */ -int compact(int n, int *odata, const int *idata) { - // TODO - return -1; +int compact(int n, int *odata, const int *idata, float* timer) { + int blockSize = 128; + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + int nearestPow = 2 << (ilog2ceil(n) - 1); //assume n > 0 + + int* dev_idata; + int* dev_odata; + int* dev_bools; + int* dev_indices; + int* indices; + + cudaMalloc((void**)&dev_idata, nearestPow * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + cudaMemset(dev_idata, 0, nearestPow); + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + cudaMalloc((void**)&dev_odata, nearestPow * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed!"); + + cudaMalloc((void**)&dev_bools, nearestPow * sizeof(int)); + checkCUDAError("cudaMalloc dev_bools failed!"); + + cudaMalloc((void**)&dev_indices, nearestPow * sizeof(int)); + checkCUDAError("cudaMalloc dev_indices failed!"); + indices = (int*)malloc(nearestPow * sizeof(int)); + + cudaEvent_t start, stop; + if (timer) { + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + } + + StreamCompaction::Common::kernMapToBoolean << > >(nearestPow, dev_bools, dev_idata); + cudaMemcpy(dev_indices, dev_bools, sizeof(int) * nearestPow, cudaMemcpyDeviceToDevice); + + // Up-sweep + int numLevels = ilog2ceil(nearestPow); + for (int d = 0; d < numLevels; d++) { + upSweep << > >(nearestPow, d, dev_indices, d == (numLevels - 1)); + } + + //Down-sweep + for (int d = numLevels; d >= 0; d--) { + downSweep << > >(nearestPow, d, dev_indices); + } + + StreamCompaction::Common::kernScatter << > >(nearestPow, dev_odata, dev_idata, dev_bools, dev_indices); + + if (timer) { + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + (*timer) += milliseconds; + } + + cudaMemcpy(indices, dev_indices, sizeof(int) * nearestPow, cudaMemcpyDeviceToHost); + int j = nearestPow - 1; + do { + j--; + } while (indices[j] == indices[j + 1]); + int compactLength = indices[j] + 1; + + cudaMemcpy(odata, dev_odata, sizeof(int) * compactLength, cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_bools); + cudaFree(dev_indices); + free(indices); + + return compactLength; } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 395ba10..9cb9a87 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -2,8 +2,8 @@ namespace StreamCompaction { namespace Efficient { - void scan(int n, int *odata, const int *idata); + float scan(int n, int *odata, const int *idata); - int compact(int n, int *odata, const int *idata); + int compact(int n, int *odata, const int *idata, float* timer = NULL); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..a71066e 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,19 +1,74 @@ #include #include +#include + #include "common.h" #include "naive.h" namespace StreamCompaction { namespace Naive { -// TODO: __global__ +__global__ void sum(int n, int startIndex, int *odata, const int *idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) return; + if (index >= startIndex) { + odata[index] = idata[index - startIndex] + idata[index]; + } + else { + odata[index] = idata[index]; + } +} + +__global__ void inclusiveToExclusiveScan(int n, int *odata, const int *idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index < n) { + odata[index] = index == 0 ? 0 : idata[index - 1]; + } +} /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ -void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); +float scan(int n, int *odata, const int *idata) { + int blockSize = 128; + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + int* dev_idata; + int* dev_odata; + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_idata failed!"); + + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAError("cudaMalloc dev_odata failed!"); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaMemcpy(dev_odata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); + + int numLevels = ilog2ceil(n); + for (int startIndex = 1; startIndex <= (1 << (numLevels - 1)); startIndex *= 2) { + sum << > >(n, startIndex, dev_odata, dev_idata); + std::swap(dev_idata, dev_odata); + } + + inclusiveToExclusiveScan << > >(n, dev_odata, dev_idata); + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + + return milliseconds; } } diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 21152d6..7090b46 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -2,6 +2,6 @@ namespace StreamCompaction { namespace Naive { - void scan(int n, int *odata, const int *idata); + float scan(int n, int *odata, const int *idata); } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..617e506 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -12,10 +12,24 @@ namespace Thrust { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ -void scan(int n, int *odata, const int *idata) { - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); +float scan(int n, int *odata, const int *idata) { + thrust::device_vector dv_idata(idata, idata + n); + thrust::device_vector dv_odata(odata, odata + n); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + thrust::exclusive_scan(dv_idata.begin(), dv_idata.end(), dv_odata.begin()); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + + thrust::copy(dv_odata.begin(), dv_odata.end(), odata); + + return milliseconds; } } diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h index 06707f3..d6182b2 100644 --- a/stream_compaction/thrust.h +++ b/stream_compaction/thrust.h @@ -2,6 +2,6 @@ namespace StreamCompaction { namespace Thrust { - void scan(int n, int *odata, const int *idata); + float scan(int n, int *odata, const int *idata); } }