diff --git a/README.md b/README.md index b71c458..5938fd3 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,214 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Jian Ru +* Tested on: Windows 10, i7-4850 @ 2.30GHz 16GB, GT 750M 2GB (Personal) -### (TODO: Your README) +### List of Features -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +* Batch scan (faster than thrust implementation even when data size is as large as 2^26) +* CPU scan and stream compaction +* Naive GPU scan +* Work-efficient GPU scan and stream compaction (used shared memory) +* Thrust scan +* GPU radix sort (added radix_sort.h and radix_sort.cu) +* Custom tests +### Performance Analysis + + ![](img/sc_perf1.png) + +* Optimizing Block Sizes + * 128 or 256 turn out to be the best. Unless you use non-multiple of warp size, + really small (like 32), or really big (like 1024) block size, the performance difference + is not really that huge + * The only exception is work-efficient scan. Since reducing block size increases the number + of sub-sums and likely increases the number of recursions, it enjoys a larger block size + (512 turns out to be optimal on my machine). Even if so, using 1024 turns out to be slower. + It may be due to reduced concurrent execution, since all the threads in a block need to be + synchronized at 3 different places in the kernel. Increasing block size means fewer threads + can terminate and leave multiprocessor early. So it is likely to hurt performance. + +* Array Size vs. 
Execution Time (ms) + * Exclusive scan + + ![](img/scan_perf.png) + + | Array Size | Naive | Efficient | Thrust | CPU | Batch | + | ---------- | ----- | --------- | ------ | --- | ----- | + | 2^8 | 0.03 | 0.01 | 0.02 | 0.00 | 0.01 | + | 2^15 | 0.42 | 0.07 | 0.70 | 0.08 | 0.03 | + | 2^20 | 5.72 | 1.39 | 0.93 | 2.31 | 0.42 | + | 2^25 | 223.64 | 43.40 | 18.15 | 77.18 | 12.39 | + | 2^26 | 465.44 | 86.81 | 35.85 | 153.48 | 24.75 | + + * Stream compaction + + ![](img/sc_perf2.png) + + | Array Size | Efficient | CPU (no scan) | CPU (with scan) | Batch | + | --- | --- | --- | --- | --- | + | 2^8 | 0.09 | 0.00 | 0.00 | 0.11 | + | 2^15 | 0.16 | 0.09 | 0.20 | 0.15 | + | 2^20 | 3.78 | 2.70 | 6.03 | 1.61 | + | 2^25 | 72.91 | 85.98 | 196.08 | 42.37 | + | 2^26 | 143.49 | 174.08 | 391.90 | 81.53 | + + * Radix sort + + ![](img/rs_perf.png) + + | Array Size | Batch | Thrust | + | --- | --- | --- | + | 2^8 | 0.42 | 0.58 | + | 2^15 | 2.44 | 0.34 | + | 2^20 | 47.35 | 5.80 | + | 2^25 | 1468.98 | 116.92 | + +* Performance Bottlenecks + * CUDA runtime APIs for memory manipulation (e.g. cudaMalloc, cudaMemcpy, cudaFree) are super expensive + * Naive scan + * Excessive global memory access + * Too many kernel calls (each level of up/down-sweep requires one kernel call) + * Work-efficient scan + * Too many threads and inefficient memory I/O + * Threads that exceed the number of cores on the GPU will queue up. Too many threads also increase + management and scheduling overhead + * Each thread only reads in two 4-byte data and the two 4-byte values are read in using two separate + instructions. GPU, due to its SIMD nature, is better in handling vector I/O. That is why + batch scan loads and stores a vec4 in each thread + * Too many kernel calls when array size is large + * In my implementation, if array size exceeds 2 times block size, two more calls are + required: one scans the sub-sums and another one scatters scanned sub-sums to corresponding + blocks. 
If the size of sub-sum array is further greater than 2 times block size, another + two extra calls are generated and this recursive behavior will go on as array size gets + larger and larger. + * Thrust's scan implementation stabilizes at 3 kernel calls regardless of the size of data. + Thus even though my implementation is comparable or even slightly faster than thrust scan, + it becomes much slower when array size becomes really large (2^25 for example) + * Bank conflicts at the beginning but have been resolved now + * Algorithm not good enough (too much computation) when compared to thrust's implementation + because my scan kernel, on average, takes twice as much time to execute as thrust's kernel + * Thrust scan + * Additional cudaMalloc and cudaFree calls + * Judging from the performance analysis timeline, thrust is doing cudaMalloc and cudaFree + even if I pass in thrust::device_ptr. This causes thrust scan to become slower than my + work-efficient scan when array size is small + * Work-efficient stream compaction + * My implementation is basically a wrapper on work-efficient scan plus two light-weight + kernels, kernMapToBoolean and kernScatter. So the bottlenecks are the same as work efficient + scan + * Self-implemented radix sort + * Excessive kernel calls + * Scan kernel implementation not good enough when compared with thrust's + * My implementation is super naive. I basically separate elements into 0 bin and 1 bin and + do this for every bit. So if the type has 32 bits, there will be 3 * 32 kernel invocations + or more when array size gets large. + * On the other hand, judging from performance analysis timeline, thrust's implementation + has 3 * 7 kernel invocations all the time and one of the kernels is super cheap (took less + than 10 microseconds to execute on 2^25 array size). 
Moreover, even the two more expensive + kernels run much faster than my scan kernel on large data size + +* Sample Output + +``` + +GeForce GT 750M [sm_30] +Array Size: 1048576 +Test group size: 100 +Note: + 1. Execution time is the average over @TEST_GROUP_SIZE times exections + 2. Runtime API memory operations were excluded from time measurement + +**************** +** RADIX SORT TESTS ** +**************** + [ 3 4 3 2 0 2 0 0 0 2 3 0 2 ... 1 0 ] +==== parallel radix sort, power-of-two ==== + Execution Time: 109.91ms + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 4 4 ] + passed +==== parallel radix sort, non power-of-two ==== + Execution Time: 113.13ms + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 4 4 ] + passed +==== thrust sort, power-of-two ==== + Execution Time: 3.55ms + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 4 4 ] + passed +==== thrust sort, non power-of-two ==== + Execution Time: 3.41ms + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 4 4 ] + passed + +**************** +** SCAN TESTS ** +**************** +==== cpu scan, power-of-two ==== + Execution Time: 2.32ms + [ 0 3 7 10 12 12 14 14 14 14 16 19 19 ... 2098448 2098449 ] +==== cpu scan, non-power-of-two ==== + Execution Time: 2.39ms + [ 0 3 7 10 12 12 14 14 14 14 16 19 19 ... 2098443 2098447 ] + passed +==== naive scan, power-of-two ==== + Execution Time: 5.73ms + [ 0 3 7 10 12 12 14 14 14 14 16 19 19 ... 2098448 2098449 ] + passed +==== naive scan, non-power-of-two ==== + Execution Time: 5.72ms + [ 0 3 7 10 12 12 14 14 14 14 16 19 19 ... 2098443 2098447 ] + passed +==== work-efficient scan, power-of-two ==== + Execution Time: 1.39ms + [ 0 3 7 10 12 12 14 14 14 14 16 19 19 ... 2098448 2098449 ] + passed +==== work-efficient scan, non-power-of-two ==== + Execution Time: 1.39ms + [ 0 3 7 10 12 12 14 14 14 14 16 19 19 ... 2098443 2098447 ] + passed +==== batch scan, power-of-two ==== + Execution Time: 0.42ms + [ 0 3 7 10 12 12 14 14 14 14 16 19 19 ... 
2098448 2098449 ] + passed +==== batch scan, non-power-of-two ==== + Execution Time: 0.42ms + [ 0 3 7 10 12 12 14 14 14 14 16 19 19 ... 2098443 2098447 ] + passed +==== thrust scan, power-of-two ==== + Execution Time: 0.93ms + [ 0 3 7 10 12 12 14 14 14 14 16 19 19 ... 2098448 2098449 ] + passed +==== thrust scan, non-power-of-two ==== + Execution Time: 0.93ms + [ 0 3 7 10 12 12 14 14 14 14 16 19 19 ... 2098443 2098447 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + Execution Time: 2.97ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + Execution Time: 2.65ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +==== cpu compact with scan ==== + Execution Time: 5.90ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +==== work-efficient compact, power-of-two ==== + Execution Time: 2.57ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +==== work-efficient compact, non-power-of-two ==== + Execution Time: 2.57ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +Press any key to continue . . . 
+``` + + diff --git a/img/rs_perf.png b/img/rs_perf.png new file mode 100644 index 0000000..66e6753 Binary files /dev/null and b/img/rs_perf.png differ diff --git a/img/sc_perf1.png b/img/sc_perf1.png new file mode 100644 index 0000000..c128ba5 Binary files /dev/null and b/img/sc_perf1.png differ diff --git a/img/sc_perf2.png b/img/sc_perf2.png new file mode 100644 index 0000000..689d8e4 Binary files /dev/null and b/img/sc_perf2.png differ diff --git a/img/scan_perf.png b/img/scan_perf.png new file mode 100644 index 0000000..fa9a251 Binary files /dev/null and b/img/scan_perf.png differ diff --git a/src/main.cpp b/src/main.cpp index 675da35..38e555d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -6,74 +6,247 @@ * @copyright University of Pennsylvania */ + +#include +#include +#include #include +#define MEASURE_EXEC_TIME #include #include #include #include +#include #include "testing_helpers.hpp" + +#define TEST_GROUP_SIZE 100 + + int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; + const int SIZE = 1 << 20; const int NPOT = SIZE - 3; - int a[SIZE], b[SIZE], c[SIZE]; + int *a = new int[SIZE]; + int *b = new int[SIZE]; + int *c = new int[SIZE]; - // Scan tests + printGPUInfo(0); + std::cout << "Array Size: " << SIZE << '\n' + << "Test group size: " << TEST_GROUP_SIZE << '\n' + << "Note:\n" + << " 1. Execution time is the average over @TEST_GROUP_SIZE times exections\n" + << " 2. 
Runtime API memory operations were excluded from time measurement\n"; + //system("pause"); + // Radix sort test printf("\n"); printf("****************\n"); - printf("** SCAN TESTS **\n"); + printf("** RADIX SORT TESTS **\n"); printf("****************\n"); - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + genArray(SIZE - 1, a, 5); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; printArray(SIZE, a, true); + float accumExecTime = 0.f; + zeroArray(SIZE, b); + printDesc("parallel radix sort, power-of-two"); + try + { + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + accumExecTime += ParallelRadixSort::sort(SIZE, reinterpret_cast(b), reinterpret_cast(a), 0xffffffff); + if (i == 0) accumExecTime = 0.f; + } + } + catch (std::exception &e) + { + std::cout << " ParallelRadixSort::sort: Error: " << e.what() << std::endl; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(SIZE, b, true); + testSorted(SIZE, b); + + accumExecTime = 0.f; + zeroArray(SIZE, b); + printDesc("parallel radix sort, non power-of-two"); + try + { + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + accumExecTime += ParallelRadixSort::sort(NPOT, reinterpret_cast(b), reinterpret_cast(a), 0xffffffff); + if (i == 0) accumExecTime = 0.f; + } + } + catch (std::exception &e) + { + std::cout << " ParallelRadixSort::sort: Error: " << e.what() << std::endl; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(NPOT, b, true); + testSorted(NPOT, b); + + //system("pause"); + + accumExecTime = 0.f; + zeroArray(SIZE, b); + printDesc("thrust sort, power-of-two"); + try + { + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + accumExecTime += ParallelRadixSort::thrustSort(SIZE, reinterpret_cast(b), reinterpret_cast(a)); + if (i == 0) accumExecTime = 0.f; + } + } + catch (std::exception &e) + { + std::cout 
<< " ParallelRadixSort::thrustSort: Error: " << e.what() << std::endl; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(SIZE, b, true); + testSorted(SIZE, b); + + accumExecTime = 0.f; + zeroArray(SIZE, b); + printDesc("thrust sort, non power-of-two"); + try + { + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + accumExecTime += ParallelRadixSort::thrustSort(NPOT, reinterpret_cast(b), reinterpret_cast(a)); + if (i == 0) accumExecTime = 0.f; + } + } + catch (std::exception &e) + { + std::cout << " ParallelRadixSort::thrustSort: Error: " << e.what() << std::endl; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(NPOT, b, true); + testSorted(NPOT, b); + + // Scan tests + printf("\n"); + printf("****************\n"); + printf("** SCAN TESTS **\n"); + printf("****************\n"); + zeroArray(SIZE, b); printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); + clock_t start = clock(); + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + StreamCompaction::CPU::scan(SIZE, b, a); + clock_t end = clock(); + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << double(end - start) / CLOCKS_PER_SEC * 1000 / TEST_GROUP_SIZE << "ms\n"; printArray(SIZE, b, true); zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); + start = clock(); + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + StreamCompaction::CPU::scan(NPOT, c, a); + end = clock(); + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << double(end - start) / CLOCKS_PER_SEC * 1000 / TEST_GROUP_SIZE << "ms\n"; printArray(NPOT, b, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - //printArray(SIZE, c, true); + accumExecTime = 0.f; + for (int i = 0; i < 
TEST_GROUP_SIZE; ++i) + { + accumExecTime += StreamCompaction::Naive::scan(SIZE, c, a); + if (i == 0) accumExecTime = 0.f; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - //printArray(SIZE, c, true); + accumExecTime = 0.f; + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + accumExecTime += StreamCompaction::Naive::scan(NPOT, c, a); + if (i == 0) accumExecTime = 0.f; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - //printArray(SIZE, c, true); + accumExecTime = 0.f; + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + accumExecTime += StreamCompaction::Efficient::scan(SIZE, c, a); + if (i == 0) accumExecTime = 0.f; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - //printArray(NPOT, c, true); + accumExecTime = 0.f; + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + accumExecTime += StreamCompaction::Efficient::scan(NPOT, c, a); + if (i == 0) accumExecTime = 0.f; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); + printDesc("batch scan, power-of-two"); + accumExecTime = 0.f; + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + accumExecTime += 
StreamCompaction::Efficient4::scan( + SIZE, reinterpret_cast(c), reinterpret_cast(a)); + if (i == 0) accumExecTime = 0.f; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("batch scan, non-power-of-two"); + accumExecTime = 0.f; + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + accumExecTime += StreamCompaction::Efficient4::scan( + NPOT, reinterpret_cast(c), reinterpret_cast(a)); + if (i == 0) accumExecTime = 0.f; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - //printArray(SIZE, c, true); + accumExecTime = 0.f; + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + accumExecTime += StreamCompaction::Thrust::scan(SIZE, c, a); + if (i == 0) accumExecTime = 0.f; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - //printArray(NPOT, c, true); + accumExecTime = 0.f; + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + accumExecTime += StreamCompaction::Thrust::scan(NPOT, c, a); + if (i == 0) accumExecTime = 0.f; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -82,6 +255,8 @@ int main(int argc, char* argv[]) { printf("*****************************\n"); // Compaction tests + //clock_t start; + //clock_t end; genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that 
edge case a[SIZE - 1] = 0; @@ -91,33 +266,103 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, b); printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + start = clock(); + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + } + end = clock(); + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << double(end - start) / CLOCKS_PER_SEC * 1000 / TEST_GROUP_SIZE << "ms\n"; expectedCount = count; printArray(count, b, true); printCmpLenResult(count, expectedCount, b, b); zeroArray(SIZE, c); printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + start = clock(); + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + } + end = clock(); + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << double(end - start) / CLOCKS_PER_SEC * 1000 / TEST_GROUP_SIZE << "ms\n"; expectedNPOT = count; printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); zeroArray(SIZE, c); printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + start = clock(); + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + } + end = clock(); + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << double(end - start) / CLOCKS_PER_SEC * 1000 / TEST_GROUP_SIZE << "ms\n"; printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); + float et; + accumExecTime = 0.f; + zeroArray(SIZE, c); + printDesc("work-efficient compact, power-of-two"); + for (int i = 0; i 
< TEST_GROUP_SIZE; ++i) + { + count = StreamCompaction::Efficient::compact(SIZE, c, a, &et); + if (i != 0) accumExecTime += et; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); + accumExecTime = 0.f; + zeroArray(SIZE, c); + printDesc("work-efficient compact, non-power-of-two"); + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + count = StreamCompaction::Efficient::compact(NPOT, c, a, &et); + if (i != 0) accumExecTime += et; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + accumExecTime = 0.f; + zeroArray(SIZE, c); + printDesc("batch compact, power-of-two"); + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + count = StreamCompaction::Efficient4::compact( + SIZE, + reinterpret_cast(c), + reinterpret_cast(a), + &et); + if (i != 0) accumExecTime += et; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + accumExecTime = 0.f; + zeroArray(SIZE, c); + printDesc("batch compact, non-power-of-two"); + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + count = StreamCompaction::Efficient4::compact( + NPOT, + reinterpret_cast(c), + reinterpret_cast(a), + &et); + if (i != 0) accumExecTime += et; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, 
c); + + delete[] a; + delete[] b; + delete[] c; + + //system("pause"); + return 0; } diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index f6b572f..a7e8738 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -1,6 +1,9 @@ #pragma once #include +#include +#include + template int cmpArrays(int n, T *a, T *b) { @@ -59,3 +62,34 @@ void printArray(int n, int *a, bool abridged = false) { printf("]\n"); } +template +void testSorted(int n, const T *data, bool lessThan = true) +{ + if (n <= 0 || !data) + { + printf(" testSorted: Error: Invalid argument(s)\n"); + return; + } + + for (int i = 0; i < n - 1; ++i) + { + if ((lessThan && (data[i] > data[i + 1])) || + (!lessThan && (data[i] < data[i + 1]))) + { + printf(" Failed: Element %d %s Element %d\n", i, lessThan ? ">" : "<", i + 1); + return; + } + } + + printf(" passed\n"); +} + +void printGPUInfo(int device) +{ + cudaDeviceProp deviceProp; + + cudaGetDeviceProperties(&deviceProp, device); + + std::cout << '\n' + << deviceProp.name << " [sm_" << deviceProp.major << deviceProp.minor << "]\n"; +} diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..fcda682 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -9,9 +9,11 @@ set(SOURCE_FILES "efficient.cu" "thrust.h" "thrust.cu" + "radix_sort.h" + "radix_sort.cu" ) cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_30 ) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index fe872d4..eb0acc0 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,5 +1,8 @@ +#include +#include #include "common.h" + void checkCUDAErrorFn(const char *msg, const char *file, int line) { cudaError_t err = cudaGetLastError(); if (cudaSuccess == err) { @@ -22,8 +25,13 @@ namespace Common { * Maps an array to an array of 0s and 1s for stream compaction. 
Elements * which map to 0 will be removed, and elements which map to 1 will be kept. */ -__global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO +__global__ void kernMapToBoolean(int n, int *bools, const int *idata) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid >= n) return; + + bools[tid] = static_cast(idata[tid] != 0); } /** @@ -31,8 +39,17 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. */ __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { - // TODO + const int *idata, const int *bools, const int *indices) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid >= n) return; + + if (bools[tid]) + { + int idx = indices[tid]; + odata[idx] = idata[tid]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 4f52663..3e32071 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -3,6 +3,7 @@ #include #include #include +#include #define FILENAME (strrchr(__FILE__, '/') ? 
strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) @@ -31,5 +32,65 @@ namespace Common { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices); + + class MyCudaTimer + { + public: + MyCudaTimer() : elapsedTimeMs(0.f), isActive(false) + { + cudaEventCreate(&begin); + cudaEventCreate(&end); + } + + virtual ~MyCudaTimer() + { + cudaEventDestroy(begin); + cudaEventDestroy(end); + } + + void start() + { + if (!isActive) + { + cudaEventRecord(begin); + isActive = true; + } + } + + void stop() + { + if (isActive) cudaEventRecord(end); + isActive = false; + } + + void restart() + { + stop(); + clear(); + start(); + } + + void clear() + { + cudaEventSynchronize(end); + elapsedTimeMs = 0.f; + } + + float duration() + { + float et; + + cudaEventSynchronize(end); + cudaEventElapsedTime(&et, begin, end); + elapsedTimeMs += et; + + return elapsedTimeMs; + } + + private: + cudaEvent_t begin, end; + float elapsedTimeMs; + bool isActive; + }; } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..d4ef883 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -8,8 +8,28 @@ namespace CPU { * CPU scan (prefix sum). */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + if (n <= 0 || !odata || !idata) + { + return; + } + + int pre, cur; + + for (int i = 0; i < n; ++i) + { + cur = idata[i]; + + if (i == 0) + { + odata[i] = 0; + } + else + { + odata[i] = pre + odata[i - 1]; + } + + pre = cur; + } } /** @@ -18,8 +38,22 @@ void scan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. 
*/ int compactWithoutScan(int n, int *odata, const int *idata) { - // TODO - return -1; + if (n <= 0 || !odata || !idata) + { + return -1; + } + + const int *obegin = odata; + + for (int i = 0; i < n; ++i) + { + if (idata[i]) + { + *odata++ = idata[i]; + } + } + + return static_cast(odata - obegin); } /** @@ -28,8 +62,40 @@ int compactWithoutScan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - // TODO - return -1; + if (n <= 0 || !odata || !idata || odata == idata) + { + return -1; + } + + if (n == 1 && idata[0]) + { + odata[0] = idata[0]; + return 1; + } + + for (int i = 0; i < n; ++i) + { + odata[i] = static_cast(idata[i] != 0); + } + + scan(n, odata, odata); + + int odataSize = odata[n - 1]; + + for (int i = 0; i < n - 1; ++i) + { + if (odata[i] != odata[i + 1]) + { + odata[odata[i]] = idata[i]; + } + } + + if (idata[n - 1]) + { + odata[odataSize] = idata[n - 1]; + ++odataSize; + } + return odataSize; } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..c67abe3 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -1,34 +1,674 @@ #include #include #include "common.h" +#define MEASURE_EXEC_TIME #include "efficient.h" +#include +#include + namespace StreamCompaction { -namespace Efficient { + namespace Efficient { -// TODO: __global__ +#ifdef USING_SHARED_MEMORY + __global__ void kernScan(int segSize, int * __restrict__ blockSums, int * __restrict__ odata, const int * __restrict__ idata) + { + extern __shared__ int temp[]; -/** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
- */ -void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); -} + const int base = blockIdx.x * segSize; + const int tid = threadIdx.x; + const int i1 = 2 * tid + 1; + const int i2 = 2 * tid + 2; + int offset = 1; + int ai, bi; -/** - * Performs stream compaction on idata, storing the result into odata. - * All zeroes are discarded. - * - * @param n The number of elements in idata. - * @param odata The array into which to store elements. - * @param idata The array of elements to compact. - * @returns The number of elements remaining after compaction. - */ -int compact(int n, int *odata, const int *idata) { - // TODO - return -1; -} + // cache data + int gidx1 = base + tid; + int gidx2 = gidx1 + blockDim.x; + int lidx1 = tid + CONFLICT_FREE_OFFSET(tid); + int lidx2 = tid + (segSize >> 1) + CONFLICT_FREE_OFFSET(tid + (segSize >> 1)); + temp[lidx1] = idata[gidx1]; + temp[lidx2] = idata[gidx2]; -} + // up sweep + for (int d = segSize >> 1; d > 0; d >>= 1) + { + __syncthreads(); + + if (tid < d) + { + ai = offset * i1 - 1; + bi = offset * i2 - 1; + ai += CONFLICT_FREE_OFFSET(ai); + bi += CONFLICT_FREE_OFFSET(bi); + + temp[bi] += temp[ai]; + } + + offset *= 2; + } + + if (tid == 0) + { + int idx = segSize - 1 + CONFLICT_FREE_OFFSET(segSize - 1); + if (blockSums) blockSums[blockIdx.x] = temp[idx]; + temp[idx] = 0; + } + + // down sweep + for (int d = 1; d < segSize; d *= 2) + { + offset >>= 1; + __syncthreads(); + + if (tid < d) + { + ai = offset * i1 - 1; + bi = offset * i2 - 1; + ai += CONFLICT_FREE_OFFSET(ai); + bi += CONFLICT_FREE_OFFSET(bi); + + int t = temp[ai]; + temp[ai] = temp[bi]; + temp[bi] += t; + } + } + + __syncthreads(); + + odata[gidx1] = temp[lidx1]; + odata[gidx2] = temp[lidx2]; + } + + __global__ void kernPerSegmentAdd(int segSize, int * __restrict__ odata, const int * __restrict__ blockSums) + { + int bid = blockIdx.x; + int tid = threadIdx.x; + int writeIdx1 = bid * segSize + 2 * tid; + int writeIdx2 = writeIdx1 + 1; + + int 
sum = blockSums[bid]; + odata[writeIdx1] += sum; + odata[writeIdx2] += sum; + } + + void scanHelper(int segSize, int n, int *odata_dev, const int *idata_dev) + { + // determine segment size + int threadsPerBlock = segSize >> 1; + int numBlocks = NUM_SEG(n, segSize); // also numSegs + + int *iblockSums = 0, *oblockSums = 0; + int segSizeNextLevel; + if (numBlocks > 1) + { + segSizeNextLevel = computeSegmentSize(numBlocks); + size_t offsetInDW = alignedSize(numBlocks * segSize * sizeof(int), 256) >> 2; + iblockSums = const_cast(idata_dev + offsetInDW); + oblockSums = odata_dev + offsetInDW; + } + + kernScan << > >(segSize, iblockSums, odata_dev, idata_dev); + + if (numBlocks > 1) + { + scanHelper(segSizeNextLevel, numBlocks, oblockSums, iblockSums); + kernPerSegmentAdd << > >(segSize, odata_dev, oblockSums); + } + } +#else + __global__ void kernScanUpSweepOneLevel(int offset, int numActiveThreads, int *iodata) + { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid >= numActiveThreads) + { + return; + } + + if (numActiveThreads == 1) // last level + { + iodata[2 * offset - 1] = 0; + return; + } + + int i1 = 2 * tid + 1; + int i2 = i1 + 1; + int ai, bi; + + ai = offset * i1 - 1; + bi = offset * i2 - 1; + iodata[bi] += iodata[ai]; + } + + __global__ void kernScanDownSweepOneLevel(int offset, int numActiveThreads, int *iodata) + { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid >= numActiveThreads) + { + return; + } + + int i1 = 2 * tid + 1; + int i2 = i1 + 1; + int ai, bi; + + ai = offset * i1 - 1; + bi = offset * i2 - 1; + int t = iodata[ai]; + iodata[ai] = iodata[bi]; + iodata[bi] += t; + } +#endif + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
+ */ +#ifdef MEASURE_EXEC_TIME + float scan(int n, int *odata, const int *idata) + { + if (n <= 0 || !odata || !idata || odata == idata) + { + return -1; + } +#else + void scan(int n, int *odata, const int *idata) + { + if (n <= 0 || !odata || !idata || odata == idata) + { + return; + } +#endif +#ifdef USING_SHARED_MEMORY + int segSize = computeSegmentSize(n); + const size_t kDevArraySizeInByte = computeActualMemSize(n); + int *odata_dev = 0; + int *idata_dev = 0; + + cudaMalloc(&odata_dev, kDevArraySizeInByte); + cudaMalloc(&idata_dev, kDevArraySizeInByte); + cudaMemset(idata_dev, 0, kDevArraySizeInByte); + cudaMemcpy(idata_dev, idata, n * sizeof(int), cudaMemcpyHostToDevice); + +#ifdef MEASURE_EXEC_TIME + float execTime = 0.f; + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); +#endif + + scanHelper(segSize, n, odata_dev, idata_dev); + +#ifdef MEASURE_EXEC_TIME + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&execTime, start, stop); + cudaEventDestroy(start); + cudaEventDestroy(stop); +#endif + + cudaMemcpy(odata, odata_dev, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(odata_dev); + cudaFree(idata_dev); + cudaDeviceSynchronize(); + +#ifdef MEASURE_EXEC_TIME + return execTime; +#endif +#else + const int paddedSize = nearestMultipleOfTwo(n); + const size_t kDevArraySizeInByte = paddedSize * sizeof(int); + int *iodata_dev = 0; + + cudaMalloc(&iodata_dev, kDevArraySizeInByte); + cudaMemset(iodata_dev, 0, kDevArraySizeInByte); + cudaMemcpy(iodata_dev, idata, n * sizeof(int), cudaMemcpyHostToDevice); + +#ifdef MEASURE_EXEC_TIME + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); +#endif + + const int threadsPerBlock = 256; + const int numLevels = ilog2ceil(n); + int numActiveThreads = paddedSize >> 1; + int offset = 1; + + // up sweep + for (int i = 0; i < numLevels; ++i) + { + int numBlocks = (numActiveThreads + 
threadsPerBlock - 1) / threadsPerBlock; + kernScanUpSweepOneLevel << > >(offset, numActiveThreads, iodata_dev); + numActiveThreads >>= 1; + offset *= 2; + } + + // down sweep + numActiveThreads = 1; + for (int i = 0; i < numLevels; ++i) + { + offset >>= 1; + int numBlocks = (numActiveThreads + threadsPerBlock - 1) / threadsPerBlock; + kernScanDownSweepOneLevel << > >(offset, numActiveThreads, iodata_dev); + numActiveThreads <<= 1; + } + +#ifdef MEASURE_EXEC_TIME + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float millisceconds = 0; + cudaEventElapsedTime(&millisceconds, start, stop); + cudaEventDestroy(start); + cudaEventDestroy(stop); +#endif + + cudaMemcpy(odata, iodata_dev, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(iodata_dev); + cudaDeviceSynchronize(); + +#ifdef MEASURE_EXEC_TIME + return millisceconds; +#endif +#endif + } + + /** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. 
+ */ +#ifdef MEASURE_EXEC_TIME + int compact(int n, int *odata, const int *idata, float *pExecTime) +#else + int compact(int n, int *odata, const int *idata) +#endif + { + if (n <= 0 || !odata || !idata || odata == idata) + { + return -1; + } + + using StreamCompaction::Common::kernMapToBoolean; + using StreamCompaction::Common::kernScatter; + +#ifdef MEASURE_EXEC_TIME + float &execTime = *pExecTime; + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); +#endif + + int *idata_dev = 0; + int *odata_dev = 0; + int *bools_dev = 0; + int *indices_dev = 0; + + int segSize = computeSegmentSize(n); + const size_t kBoolsSizeInByte = computeActualMemSize(n); + const size_t kIndicesSizeInByte = kBoolsSizeInByte; + + cudaMalloc(&idata_dev, n * sizeof(int)); + cudaMalloc(&bools_dev, kBoolsSizeInByte); + cudaMalloc(&indices_dev, kIndicesSizeInByte); + + cudaMemcpy(idata_dev, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemset(bools_dev, 0, kBoolsSizeInByte); + + const int threadsPerBlock = 256; + int numBlocks = (n + threadsPerBlock - 1) / threadsPerBlock; + +#ifdef MEASURE_EXEC_TIME + cudaEventRecord(start); + + kernMapToBoolean << > >(n, bools_dev, idata_dev); + + scanHelper(segSize, n, indices_dev, bools_dev); + + int numElemRemained; + cudaMemcpy(&numElemRemained, indices_dev + (n - 1), sizeof(int), cudaMemcpyDeviceToHost); + numElemRemained += idata[n - 1] ? 1 : 0; + cudaMalloc(&odata_dev, numElemRemained * sizeof(int)); + + kernScatter<<>>(n, odata_dev, idata_dev, bools_dev, indices_dev); + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&execTime, start, stop); +#else + kernMapToBoolean << > >(n, bools_dev, idata_dev); + + scanHelper(segSize, n, indices_dev, bools_dev); + + int numElemRemained; + cudaMemcpy(&numElemRemained, indices_dev + (n - 1), sizeof(int), cudaMemcpyDeviceToHost); + numElemRemained += idata[n - 1] ? 
1 : 0; + cudaMalloc(&odata_dev, numElemRemained * sizeof(int)); + + kernScatter << > >(n, odata_dev, idata_dev, bools_dev, indices_dev); +#endif + + cudaMemcpy(odata, odata_dev, numElemRemained * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(idata_dev); + cudaFree(odata_dev); + cudaFree(bools_dev); + cudaFree(indices_dev); + + return numElemRemained; + } + + } + + namespace Efficient4 + { + // Naive scan + __device__ unsigned scan1Inclusive(unsigned idata, volatile unsigned *s_data) + { + int pos = threadIdx.x; + int size = blockDim.x; + + // Pad 0's before to avoid pos >= offset branching + s_data[pos] = 0; + pos += size; + s_data[pos] = idata; + + for (int offset = 1; offset < size; offset <<= 1) + { + __syncthreads(); + unsigned t = s_data[pos - offset]; + __syncthreads(); + s_data[pos] += t; + } + + return s_data[pos]; + } + + __device__ unsigned scan1Exclusive(unsigned idata, volatile unsigned *s_data) + { + return scan1Inclusive(idata, s_data) - idata; + } + + __device__ uint4 scan4Inclusive(uint4 idata4, volatile unsigned *s_data) + { + // Perform inclusive prefix sum on the four elements + idata4.y += idata4.x; + idata4.z += idata4.y; + idata4.w += idata4.z; + + // Exclusively scan the 4-element sums on each block + unsigned oval = scan1Exclusive(idata4.w, s_data); + + // Accumulate results + idata4.x += oval; + idata4.y += oval; + idata4.z += oval; + idata4.w += oval; + + return idata4; + } + + __device__ uint4 scan4Exclusive(uint4 idata4, volatile unsigned *s_data) + { + uint4 odata4 = scan4Inclusive(idata4, s_data); + + // Make exclusive + odata4.x -= idata4.x; + odata4.y -= idata4.y; + odata4.z -= idata4.z; + odata4.w -= idata4.w; + + return odata4; + } + + /** + * Perform exclusive prefix sum on each thread block + * @n - size of d_dst and d_src + * @d_dst - destination device buffer + * @d_src - source device buffer + */ + __global__ void scanExclusiveShared(int n, uint4 *d_dst, uint4 *d_src) + { + extern __shared__ unsigned s_data[]; // size = 2 * 
blockDim.x + + int tid = blockDim.x * blockIdx.x + threadIdx.x; + + uint4 idata4; + if (tid < n) idata4 = d_src[tid]; + uint4 odata4 = scan4Exclusive(idata4, s_data); + if (tid < n) d_dst[tid] = odata4; + } + + /* + * @n - valid size of d_buf (not physical size) + * @preSegSize - segment size used in previous level scan + */ + __global__ void scanExclusiveShared2(int n, int preSegSize, + unsigned *d_obuf, unsigned *d_ibuf, unsigned *d_dst, unsigned *d_src) + { + extern __shared__ unsigned s_data[]; // size = 2 * blockDim.x + + int tid = blockDim.x * blockIdx.x + threadIdx.x; + + unsigned idata; + if (tid < n) + { + idata = d_dst[preSegSize - 1 + preSegSize * tid] + + d_src[preSegSize - 1 + preSegSize * tid]; + d_ibuf[tid] = idata; // so that result can be made inclusive again + } + + // Scan the sum of each block from last level scan + unsigned odata = scan1Exclusive(idata, s_data); + + if (tid < n) d_obuf[tid] = odata; + } + + /* + * @d_dst - size = blockDim.x * numBlocks + * @d_buf - size = numBlocks + */ + __global__ void perSegmentAdd(uint4 *d_dst, unsigned *d_buf) + { + __shared__ unsigned buf; + + int tid = blockDim.x * blockIdx.x + threadIdx.x; + + if (threadIdx.x == 0) buf = d_buf[blockIdx.x]; + + __syncthreads(); + + uint4 data4 = d_dst[tid]; + data4.x += buf; + data4.y += buf; + data4.z += buf; + data4.w += buf; + d_dst[tid] = data4; + } + + int computeSegmentSize(int n) + { + using Efficient::nearestMultipleOfTwo; + return n > (MAX_SEGMENT_SIZE >> 1) ? 
MAX_SEGMENT_SIZE : + std::max(nearestMultipleOfTwo(n), 4); + } + + size_t computeActualMemSize(int n) + { + using Efficient::alignedSize; + + const size_t kMemAlignmentInBytes = 256; // the alignment CUDA driver used + + int segSize = computeSegmentSize(n); + int numSegs = NUM_SEG(n, segSize); + size_t total = alignedSize(numSegs * segSize * sizeof(unsigned), kMemAlignmentInBytes); + + // Extra size to store intermediate results + while (numSegs > 1) + { + segSize = std::min(256, computeSegmentSize(numSegs)); + numSegs = NUM_SEG(numSegs, segSize); + size_t extra = alignedSize(numSegs * segSize * sizeof(unsigned), kMemAlignmentInBytes); + total += extra; + } + + return total; + } + + void scanHelper(int n, unsigned *d_odata, unsigned *d_idata) + { + using Efficient::alignedSize; + + int segSize = computeSegmentSize(n); + int numSegs = NUM_SEG(n, segSize); + int threadsPerBlock = segSize >> 2; + + scanExclusiveShared<<>>( + ROUND_SEG_SIZE(n, 4) >> 2, + reinterpret_cast(d_odata), + reinterpret_cast(d_idata)); + + std::vector st; + unsigned *d_dst = d_odata; + unsigned *d_src = d_idata; + + while (numSegs > 1) + { + st.push_back(numSegs); + st.push_back(segSize); + + size_t offsetInDW = alignedSize(numSegs * segSize * sizeof(unsigned), 256) / 4; + unsigned *d_ibuf = d_src + offsetInDW; + unsigned *d_obuf = d_dst + offsetInDW; + int n = numSegs; + int preSegSize = segSize; + + threadsPerBlock = segSize = std::min(256, computeSegmentSize(numSegs)); + numSegs = NUM_SEG(numSegs, segSize); + + scanExclusiveShared2<<>>( + n, preSegSize, d_obuf, d_ibuf, d_dst, d_src); + + d_src = d_ibuf; + d_dst = d_obuf; + } + + while (!st.empty()) + { + segSize = st.back(); + st.pop_back(); + numSegs = st.back(); + st.pop_back(); + + size_t offsetInDW = alignedSize(numSegs * segSize * sizeof(unsigned), 256) / 4; + unsigned *d_buf = d_dst; + d_dst -= offsetInDW; + + perSegmentAdd<<> 2)>>>(reinterpret_cast(d_dst), d_buf); + } + } + + float scan(int n, unsigned *odata, const unsigned *idata) + 
{ + if (n <= 0 || !odata || !idata) + { + throw std::exception("StreamCompaction::Efficient4::scan"); + } + + Common::MyCudaTimer timer; + + size_t kDevBuffSize = computeActualMemSize(n); + unsigned *d_idata, *d_odata; + + cudaMalloc(&d_idata, kDevBuffSize); + cudaMalloc(&d_odata, kDevBuffSize); + cudaMemset(d_idata, 0, kDevBuffSize); + //cudaMemset(d_odata, 0, kDevBuffSize); + cudaMemcpy(d_idata, idata, n * sizeof(unsigned), cudaMemcpyHostToDevice); + + timer.start(); + + scanHelper(n, d_odata, d_idata); + + timer.stop(); + + cudaMemcpy(odata, d_odata, n * sizeof(unsigned), cudaMemcpyDeviceToHost); + cudaFree(d_idata); + cudaFree(d_odata); + + return timer.duration(); + } + + + int compact(int n, unsigned *odata, const unsigned *idata, float *pExecTime) + { + if (n <= 0 || !odata || !idata || odata == idata) + { + throw std::exception("StreamCompaction::Efficient4::compact"); + } + + using StreamCompaction::Common::kernMapToBoolean; + using StreamCompaction::Common::kernScatter; + + Common::MyCudaTimer timer; + + unsigned *idata_dev = 0; + unsigned *odata_dev = 0; + unsigned *bools_dev = 0; + unsigned *indices_dev = 0; + + int segSize = computeSegmentSize(n); + const size_t kBoolsSizeInByte = computeActualMemSize(n); + const size_t kIndicesSizeInByte = kBoolsSizeInByte; + + cudaMalloc(&idata_dev, n * sizeof(unsigned)); + cudaMalloc(&bools_dev, kBoolsSizeInByte); + cudaMalloc(&indices_dev, kIndicesSizeInByte); + + cudaMemcpy(idata_dev, idata, n * sizeof(unsigned), cudaMemcpyHostToDevice); + cudaMemset(bools_dev, 0, kBoolsSizeInByte); + + const int threadsPerBlock = 256; + int numBlocks = (n + threadsPerBlock - 1) / threadsPerBlock; + + timer.start(); + + kernMapToBoolean<<>>( + n, + reinterpret_cast(bools_dev), + reinterpret_cast(idata_dev)); + + scanHelper(n, indices_dev, bools_dev); + + int numElemRemained; + cudaMemcpy(&numElemRemained, indices_dev + (n - 1), sizeof(unsigned), cudaMemcpyDeviceToHost); + numElemRemained += idata[n - 1] ? 
1 : 0; + cudaMalloc(&odata_dev, numElemRemained * sizeof(unsigned)); + + kernScatter<<>>( + n, + reinterpret_cast(odata_dev), + reinterpret_cast(idata_dev), + reinterpret_cast(bools_dev), + reinterpret_cast(indices_dev)); + + timer.stop(); + *pExecTime = timer.duration(); + + cudaMemcpy(odata, odata_dev, numElemRemained * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(idata_dev); + cudaFree(odata_dev); + cudaFree(bools_dev); + cudaFree(indices_dev); + + return numElemRemained; + } + } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 395ba10..6de9abf 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -1,9 +1,92 @@ #pragma once -namespace StreamCompaction { -namespace Efficient { - void scan(int n, int *odata, const int *idata); +#define USING_SHARED_MEMORY - int compact(int n, int *odata, const int *idata); -} +#ifdef USING_SHARED_MEMORY +#define MAX_SEGMENT_SIZE 1024 +#define NUM_SEG(x, ss) (((x) + (ss) - 1) / (ss)) +#define ROUND_SEG_SIZE(x, ss) (NUM_SEG(x, (ss)) * (ss)) + +#define NUM_BANKS 32 +#define LOG_NUM_BANKS 5 +#define CONFLICT_FREE_OFFSET(x) ((x) >> LOG_NUM_BANKS) +#endif + +namespace StreamCompaction +{ + namespace Efficient + { +#ifdef MEASURE_EXEC_TIME + float scan(int n, int *odata, const int *idata); + + int compact(int n, int *odata, const int *idata, float *pExecTime); +#else + void scan(int n, int *odata, const int *idata); + + int compact(int n, int *odata, const int *idata); +#endif + + void scanHelper(int segSize, int n, int *odata_dev, const int *idata_dev); + + inline int nearestMultipleOfTwo(int n) + { + int result = 1; + while (result < n) result <<= 1; + return result; + } + + inline int computeSegmentSize(int n) + { + return n > (MAX_SEGMENT_SIZE >> 1) ? 
MAX_SEGMENT_SIZE : nearestMultipleOfTwo(n); + } + + inline size_t alignedSize(size_t sizeInBytes, size_t alignmentInBytes) + { + return (sizeInBytes + alignmentInBytes - 1) / alignmentInBytes * alignmentInBytes; + } + + /** + * When data size is large, multiple passes are necessary to compute the + * final result and extra device memory is required to store intermediate + * results. + * + * Use the result from this function to cudaMalloc the input and output + * buffers passed to scanHelper which assumes that I/O device buffers have + * sufficient size. + * + * Assuming address start at a 256-byte boundary + */ + template + inline size_t computeActualMemSize(int n) + { + const size_t kMemAlignmentInBytes = 256; // the alignment CUDA driver used + + int segSize = computeSegmentSize(n); + int numSegs = NUM_SEG(n, segSize); + size_t total = alignedSize(numSegs * segSize * sizeof(T), kMemAlignmentInBytes); + + while (numSegs > 1) + { + segSize = computeSegmentSize(numSegs); + numSegs = NUM_SEG(numSegs, segSize); + size_t extra = alignedSize(numSegs * segSize * sizeof(T), kMemAlignmentInBytes); + total += extra; + } + + return total; + } + } + + namespace Efficient4 + { + float scan(int n, unsigned *odata, const unsigned *idata); + + int compact(int n, unsigned *odata, const unsigned *idata, float *pExecTime); + + int computeSegmentSize(int n); + + size_t computeActualMemSize(int n); + + void scanHelper(int n, unsigned *d_odata, unsigned *d_idata); + } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..5c301ff 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,20 +1,103 @@ #include #include #include "common.h" +#define MEASURE_EXEC_TIME #include "naive.h" namespace StreamCompaction { -namespace Naive { + namespace Naive { -// TODO: __global__ + __global__ void kernScanOneLevel(int stride, int n, int * __restrict__ odata, const int * __restrict__ idata) + { + unsigned tid = blockIdx.x * blockDim.x + 
threadIdx.x; -/** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ -void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); -} + if (tid >= n) return; -} + if (tid >= stride) + { + odata[tid] = idata[tid - stride] + idata[tid]; + } + else + { + odata[tid] = idata[tid]; + } + } + + __global__ void makeExclusive(int n, int * __restrict__ odata, int * __restrict__ idata) + { + unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid >= n) return; + + odata[tid] = tid == 0 ? 0 : idata[tid - 1]; + } + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ +#ifdef MEASURE_EXEC_TIME + float scan(int n, int *odata, const int *idata) + { + if (n <= 0 || !odata || !idata || odata == idata) + { + return -1; + } +#else + void scan(int n, int *odata, const int *idata) + { + if (n <= 0 || !odata || !idata || odata == idata) + { + return; + } +#endif + const size_t kArraySizeInByte = n * sizeof(int); + int *idata_dev = nullptr, *odata_dev = nullptr; + + cudaMalloc(&idata_dev, kArraySizeInByte); + cudaMalloc(&odata_dev, kArraySizeInByte); + cudaMemcpy(idata_dev, idata, kArraySizeInByte, cudaMemcpyHostToDevice); + +#ifdef MEASURE_EXEC_TIME + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); +#endif + + const int numLevels = ilog2ceil(n); + const int threadsPerBlock = 256; + const int numBlocks = (n + threadsPerBlock - 1) / threadsPerBlock; + int stride = 1; + + for (int i = 0; i < numLevels; ++i) + { + kernScanOneLevel << > >(stride, n, odata_dev, idata_dev); + stride <<= 1; + //swap + int *tmp = odata_dev; + odata_dev = idata_dev; + idata_dev = tmp; + } + makeExclusive << > >(n, odata_dev, idata_dev); + +#ifdef MEASURE_EXEC_TIME + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float millisceconds = 0; + cudaEventElapsedTime(&millisceconds, start, stop); + cudaEventDestroy(start); + cudaEventDestroy(stop); +#endif + + 
cudaMemcpy(odata, odata_dev, kArraySizeInByte, cudaMemcpyDeviceToHost); + cudaFree(idata_dev); + cudaFree(odata_dev); + cudaDeviceSynchronize(); // make sure result is ready + +#ifdef MEASURE_EXEC_TIME + return millisceconds; +#endif + } + + } } diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 21152d6..3ab5364 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -2,6 +2,10 @@ namespace StreamCompaction { namespace Naive { - void scan(int n, int *odata, const int *idata); +#ifdef MEASURE_EXEC_TIME + float scan(int n, int *odata, const int *idata); +#else + void scan(int n, int *odata, const int *idata); +#endif } } diff --git a/stream_compaction/radix_sort.cu b/stream_compaction/radix_sort.cu new file mode 100644 index 0000000..7cc94b1 --- /dev/null +++ b/stream_compaction/radix_sort.cu @@ -0,0 +1,201 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#define MEASURE_EXEC_TIME +#include +#include "radix_sort.h" + + +namespace std +{ + class InvalidArgument : public exception + { + public: + virtual const char *what() const throw() + { + return "One or more invalid arguments detected"; + } + }; +} + + +namespace ParallelRadixSort +{ + template + __global__ void kernClassify(uint32_t n, T mask, + uint32_t * __restrict__ notbools, uint32_t * __restrict__ bools, const T * __restrict__ idata) + { + uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid >= n) return; + + uint32_t t = static_cast((idata[tid] & mask) == 0); + notbools[tid] = t; + bools[tid] = t ^ 0x1; + } + + template + __global__ void kernScatter(uint32_t n, + uint32_t * __restrict__ nobools, uint32_t * __restrict__ noindices, uint32_t * __restrict__ yesindices, + T * __restrict__ odata, const T * __restrict__ idata) + { + uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid >= n) return; + + uint32_t isBitZero = nobools[tid]; + uint32_t idx; + + if (isBitZero) idx = noindices[tid]; else idx = 
yesindices[tid]; + + odata[idx] = idata[tid]; + } + +#ifdef MEASURE_EXEC_TIME + template + float sort(int n, T *odata, const T *idata, T bitMask, bool lsb) +#else + template + void sort(int n, T *odata, const T *idata, T bitMask, bool lsb) +#endif + { + if (n <= 0 || !odata || !idata) + { + throw std::InvalidArgument(); + } + + int segSize = StreamCompaction::Efficient4::computeSegmentSize(2 * n); + const size_t kDevArraySizeInByte = StreamCompaction::Efficient4::computeActualMemSize(2 * n); + + T *idata_dev = 0; + T *odata_dev = 0; + uint32_t *noyes_bools_dev = 0; + uint32_t *indices_dev = 0; + + cudaMalloc(&idata_dev, n * sizeof(T)); + cudaMalloc(&odata_dev, n * sizeof(T)); + cudaMalloc(&noyes_bools_dev, kDevArraySizeInByte); + cudaMalloc(&indices_dev, kDevArraySizeInByte); + + cudaMemcpy(idata_dev, idata, n * sizeof(T), cudaMemcpyHostToDevice); + + const int threadsPerBlock = 256; + int numBlocks = (n + threadsPerBlock - 1) / threadsPerBlock; + int numBits = 8 * sizeof(T); + T mask = lsb ? 
1 : (1 << (numBits - 1)); + +#ifdef MEASURE_EXEC_TIME + float execTime = 0.f; + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + + for (int i = 0; i < numBits; ++i) + { + if (!(bitMask & mask)) continue; // do not consider this bit + + kernClassify<<>>(n, mask, noyes_bools_dev, noyes_bools_dev + n, idata_dev); + + StreamCompaction::Efficient4::scanHelper(2 * n, indices_dev, noyes_bools_dev); + + kernScatter<<>>(n, noyes_bools_dev, indices_dev, indices_dev + n, odata_dev, idata_dev); + + if (lsb) mask <<= 1; else mask >>= 1; + + T *tmp = odata_dev; + odata_dev = idata_dev; + idata_dev = tmp; + } + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&execTime, start, stop); + + cudaEventDestroy(start); + cudaEventDestroy(stop); +#else + for (int i = 0; i < numBits; ++i) + { + if (!(bitMask & mask)) continue; // do not consider this bit + + kernClassify << > >(n, mask, noyes_bools_dev, noyes_bools_dev + n, idata_dev); + + StreamCompaction::Efficient::scanHelper(segSize, 2 * n, indices_dev, noyes_bools_dev); + + kernScatter << > >(n, noyes_bools_dev, indices_dev, indices_dev + n, odata_dev, idata_dev); + + if (lsb) mask <<= 1; else mask >>= 1; + + T *tmp = odata_dev; + odata_dev = idata_dev; + idata_dev = tmp; + } +#endif + + cudaMemcpy(odata, idata_dev, n * sizeof(T), cudaMemcpyDeviceToHost); + cudaFree(idata_dev); + cudaFree(odata_dev); + cudaFree(noyes_bools_dev); + cudaFree(indices_dev); + +#ifdef MEASURE_EXEC_TIME + return execTime; +#endif + } + + // Since template definition is not visible to users (main.obj in this case), + // we need to explicitly tell the compiler to generate all the template implementations + // that will be used later +#ifdef MEASURE_EXEC_TIME + template float sort(int n, uint32_t *odata, const uint32_t *idata, uint32_t bitMask, bool lsb); +#else + template void sort(int n, uint32_t *odata, const uint32_t *idata, uint32_t bitMask, bool lsb); +#endif + 
+#ifdef MEASURE_EXEC_TIME + template + float thrustSort(int n, T *odata, const T *idata) + { + if (n <= 0 || !odata || !idata) + { + throw std::InvalidArgument(); + } + + T *iodata_dev = 0; + + cudaMalloc(&iodata_dev, n * sizeof(T)); + cudaMemcpy(iodata_dev, idata, n * sizeof(T), cudaMemcpyHostToDevice); + + thrust::device_ptr thrust_iodata(iodata_dev); + + float execTime; + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); + + thrust::stable_sort(thrust_iodata, thrust_iodata + n); + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&execTime, start, stop); + cudaEventDestroy(start); + cudaEventDestroy(stop); + + cudaMemcpy(odata, iodata_dev, n * sizeof(T), cudaMemcpyDeviceToHost); + cudaFree(iodata_dev); + + return execTime; + } + + template float thrustSort(int n, uint32_t *odata, const uint32_t *idata); +#endif +} \ No newline at end of file diff --git a/stream_compaction/radix_sort.h b/stream_compaction/radix_sort.h new file mode 100644 index 0000000..8eea74d --- /dev/null +++ b/stream_compaction/radix_sort.h @@ -0,0 +1,19 @@ +#pragma once + +#include + + +namespace ParallelRadixSort +{ + // Only work on unsigned integral types +#ifdef MEASURE_EXEC_TIME + template + float sort(int n, T *odata, const T *idata, T bitMask, bool lsb = true); + + template + float thrustSort(int n, T *odata, const T *idata); +#else + template + void sort(int n, T *odata, const T *idata, T bitMask, bool lsb = true); +#endif +} \ No newline at end of file diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..4c9cbaa 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -4,19 +4,70 @@ #include #include #include "common.h" +#define MEASURE_EXEC_TIME #include "thrust.h" namespace StreamCompaction { -namespace Thrust { - -/** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
- */ -void scan(int n, int *odata, const int *idata) { - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); -} + namespace Thrust { -} + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ +#ifdef MEASURE_EXEC_TIME + float scan(int n, int *odata, const int *idata) + { + if (n <= 0 || !odata || !idata || odata == idata) + { + return -1; + } +#else + void scan(int n, int *odata, const int *idata) + { + if (n <= 0 || !odata || !idata || odata == idata) + { + return; + } +#endif + // TODO use `thrust::exclusive_scan` + // example: for device_vectors dv_in and dv_out: + // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + const size_t kArraySizeInByte = n * sizeof(int); + int *idata_dev = nullptr, *odata_dev = nullptr; + + cudaMalloc(&idata_dev, kArraySizeInByte); + cudaMalloc(&odata_dev, kArraySizeInByte); + cudaMemcpy(idata_dev, idata, kArraySizeInByte, cudaMemcpyHostToDevice); + + thrust::device_ptr thrust_idata(idata_dev); + thrust::device_ptr thrust_odata(odata_dev); + +#ifdef MEASURE_EXEC_TIME + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); +#endif + + thrust::exclusive_scan(thrust_idata, thrust_idata + n, thrust_odata); + +#ifdef MEASURE_EXEC_TIME + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float millisceconds = 0; + cudaEventElapsedTime(&millisceconds, start, stop); + cudaEventDestroy(start); + cudaEventDestroy(stop); +#endif + + cudaMemcpy(odata, odata_dev, kArraySizeInByte, cudaMemcpyDeviceToHost); + cudaFree(idata_dev); + cudaFree(odata_dev); + cudaDeviceSynchronize(); // make sure result is ready + +#ifdef MEASURE_EXEC_TIME + return millisceconds; +#endif + } + + } } diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h index 06707f3..baf6d97 100644 --- a/stream_compaction/thrust.h +++ b/stream_compaction/thrust.h 
@@ -2,6 +2,10 @@ namespace StreamCompaction { namespace Thrust { - void scan(int n, int *odata, const int *idata); +#ifdef MEASURE_EXEC_TIME + float scan(int n, int *odata, const int *idata); +#else + void scan(int n, int *odata, const int *idata); +#endif } }