From 0ee1efefbfc0e35967b7f331c8337e17945108fa Mon Sep 17 00:00:00 2001 From: Trung Le Date: Tue, 20 Sep 2016 11:16:54 -0400 Subject: [PATCH 01/15] Implemented CPU scan and stream compaction --- stream_compaction/cpu.cu | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..0b46006 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,5 +1,6 @@ #include #include "cpu.h" +#include namespace StreamCompaction { namespace CPU { @@ -9,7 +10,10 @@ namespace CPU { */ void scan(int n, int *odata, const int *idata) { // TODO - printf("TODO\n"); + odata[0] = 0; + for (int i = 1; i < n; ++i) { + odata[i] = odata[i - 1] + idata[i - 1]; + } } /** @@ -19,7 +23,13 @@ void scan(int n, int *odata, const int *idata) { */ int compactWithoutScan(int n, int *odata, const int *idata) { // TODO - return -1; + int oIndex = 0; + for (int iIndex = 0; iIndex < n; ++iIndex) { + if (idata[iIndex] != 0) { + odata[oIndex++] = idata[iIndex]; + } + } + return oIndex; } /** @@ -29,7 +39,25 @@ int compactWithoutScan(int n, int *odata, const int *idata) { */ int compactWithScan(int n, int *odata, const int *idata) { // TODO - return -1; + memset(odata, 0, n); + for (int i = 0; i < n; ++i) { + if (idata[i] != 0) { + odata[i] = 1; + } + } + int* scanResult = new int[n]; + scan(n, scanResult, odata); + + int remainingNumberOfElements = 0; + for (int i = 0; i < n; ++i) { + if (odata[i] == 1) { + odata[scanResult[i]] = idata[i]; + remainingNumberOfElements = scanResult[i] + 1; + } + } + + delete[] scanResult; + return remainingNumberOfElements; } } From 32b257f22a051d06d0d97b52da45cab12fdc49bc Mon Sep 17 00:00:00 2001 From: Trung Le Date: Tue, 20 Sep 2016 14:14:51 -0400 Subject: [PATCH 02/15] Implemented naive GPU scan --- stream_compaction/naive.cu | 86 +++++++++++++++++++++++++++++++++++++- 1 file changed, 85 insertions(+), 1 deletion(-) diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..7a7e33d 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,17 +3,101 @@ #include "common.h" #include "naive.h" +//#define DEBUG +#define BLOCK_SIZE 128 +#define BLOCK_COUNT(n) ((n + BLOCK_SIZE - 1) / BLOCK_SIZE) + namespace StreamCompaction { namespace Naive { // TODO: __global__ +__global__ void inclusiveToExclusiveScan(int n, int* odata, const int* idata) { + int tid = threadIdx.x + (blockIdx.x * blockDim.x); + if (tid >= n) { + return; + } + + if (tid == 0) { + odata[0] = 0; + return; + } + + odata[tid] = idata[tid - 1]; +} + +__global__ void naiveScan(int n, int offset, int* odata, const int *idata) { + int tid = threadIdx.x + (blockIdx.x * blockDim.x); + if (tid >= n) { + return; + } + if (tid >= offset) { + odata[tid] = idata[tid - offset] + idata[tid] ; + } else { + odata[tid] = idata[tid]; + } +} + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { // TODO - printf("TODO\n"); + + int* dev_odata1; + int* dev_odata2; + + cudaMalloc((void**)&dev_odata1, n * sizeof(int)); + cudaMalloc((void**)&dev_odata2, n * sizeof(int)); + + cudaMemcpy(dev_odata1, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(dev_odata2, idata, n * sizeof(int), cudaMemcpyHostToDevice); +#ifdef DEBUG + int* temp = new int[5]; +#endif + + int height = ilog2ceil(n); + //height = 2; + for (int level = 1; level <= height; ++level) { + int offset = pow(2, level - 1); + naiveScan << > >( + n, + offset, + (level % 2) == 0 ? dev_odata1 : dev_odata2, + (level % 2) == 0 ? dev_odata2 : dev_odata1 + ); + +#ifdef DEBUG + printf("----\n"); + cudaMemcpy(temp, dev_odata1, 5 * sizeof(int), cudaMemcpyDeviceToHost); + for (int j = 0; j < 5; ++j) { + printf("offset: %d, odata1[k]: %d\n", offset, temp[j]); + } + printf("\n"); + cudaMemcpy(temp, dev_odata2, 5 * sizeof(int), cudaMemcpyDeviceToHost); + for (int j = 0; j < 5; ++j) { + printf("offset: %d, odata2[k]: %d\n", offset, temp[j]); + } +#endif + + } + + if (height % 2 == 0) { + inclusiveToExclusiveScan<<>>(n, dev_odata2, dev_odata1); + cudaMemcpy(odata, dev_odata2, n * sizeof(int), cudaMemcpyDeviceToHost); + } else { + inclusiveToExclusiveScan << > >(n, dev_odata1, dev_odata2); + cudaMemcpy(odata, dev_odata1, n * sizeof(int), cudaMemcpyDeviceToHost); + } + + odata[0] = 0; + +#ifdef DEBUG + delete[] temp; +#endif + + cudaFree(dev_odata1); + cudaFree(dev_odata2); } } From af6eccc9b3fbcbc62270c7bab3d30dc0424e8d82 Mon Sep 17 00:00:00 2001 From: Trung Le Date: Tue, 20 Sep 2016 20:15:14 -0400 Subject: [PATCH 03/15] Implemented work-efficient scan --- stream_compaction/efficient.cu | 54 +++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..5457cd0 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,17 +3,69 @@ #include "common.h" #include "efficient.h" +#define BLOCK_SIZE 128 +#define BLOCK_COUNT(n) ((n + BLOCK_SIZE - 1) / BLOCK_SIZE) + namespace StreamCompaction { namespace Efficient { // TODO: __global__ +__global__ void upsweep(int n, int level, int* odata) { + int tid = threadIdx.x + (blockIdx.x * blockDim.x); + if (tid >= n) { + return; + } + + int twoToLevel = powf(2, level); + int twoToLevelPlusOne = powf(2, level + 1); + if (tid % twoToLevelPlusOne == 0) { + odata[tid + twoToLevelPlusOne - 1] += odata[tid + twoToLevel - 1]; + } +} + +__global__ void downsweep(int n, int level, int* odata) { + int tid = threadIdx.x + (blockIdx.x * blockDim.x); + if (tid >= n) { + return; + } + int twoToLevel = powf(2, level); + int twoToLevelPlusOne = powf(2, level + 1); + + if (tid % twoToLevelPlusOne == 0) { + int t = odata[tid + twoToLevel - 1]; + odata[tid + twoToLevel - 1] = odata[tid + twoToLevelPlusOne - 1]; + odata[tid + twoToLevelPlusOne - 1] += t; + } +} + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { // TODO - printf("TODO\n"); + int* dev_odata; + + int height = ilog2ceil(n); + + int ceilPower2 = pow(2, height); + cudaMalloc((void**)&dev_odata, ceilPower2 * sizeof(int)); + cudaMemset(dev_odata, 0, ceilPower2 * sizeof(int)); + cudaMemcpy(dev_odata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + for (int level = 0; level < height; ++level) { + upsweep << > >(ceilPower2, level, dev_odata); + } + + cudaMemset(dev_odata + (ceilPower2 - 1), 0, sizeof(int)); + + for (int level = height - 1; level >= 0; --level) { + downsweep << > >(ceilPower2, level, dev_odata); + } + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_odata); } /** From 6b1fd30b0a35a38ff3215b99432b9a66ae204c3d Mon Sep 17 00:00:00 2001 From: Trung Le Date: Tue, 20 Sep 2016 20:16:39 -0400 Subject: [PATCH 04/15] Implemen thrust scan --- stream_compaction/thrust.cu | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..16bbd17 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -16,6 +16,14 @@ void scan(int n, int *odata, const int *idata) { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + // Convert to device vector + thrust::device_vector dev_idata(idata, idata + n); + thrust::device_vector dev_odata(odata, odata + n); + thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin()); + + thrust::host_vector host_odata = dev_odata; + cudaMemcpy(odata, host_odata.data(), n * sizeof(int), cudaMemcpyHostToHost); } } From 6d83f44d01fbc2fd7ff2d48f3ebf4313532a75eb Mon Sep 17 00:00:00 2001 From: Trung Le Date: Tue, 20 Sep 2016 20:31:58 -0400 Subject: [PATCH 05/15] Moved inclusive to exclusive scan result conversion kernel to common.cu --- stream_compaction/common.cu | 20 ++++++++++++++++++ stream_compaction/common.h | 2 ++ stream_compaction/naive.cu | 42 ++----------------------------------- 3 files changed, 24 insertions(+), 40 deletions(-) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index fe872d4..2026069 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -18,6 +18,26 @@ void checkCUDAErrorFn(const char *msg, const char *file, int line) { namespace StreamCompaction { namespace Common { + /** + * Convert an inclusice scan result to an exclusive scan result + * + */ +__global__ void inclusiveToExclusiveScanResult(int n, int* odata, const int* idata) { + int tid = threadIdx.x + (blockIdx.x * blockDim.x); + if (tid >= n) { + return; + } + + if (tid == 0) { + odata[0] = 0; + return; + } + + odata[tid] = idata[tid - 1]; +} + + + /** * Maps an array to an array of 0s and 1s for stream compaction. Elements * which map to 0 will be removed, and elements which map to 1 will be kept. diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 4f52663..7b55c66 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -27,6 +27,8 @@ inline int ilog2ceil(int x) { namespace StreamCompaction { namespace Common { + __global__ void inclusiveToExclusiveScanResult(int n, int* odata, const int* idata); + __global__ void kernMapToBoolean(int n, int *bools, const int *idata); __global__ void kernScatter(int n, int *odata, diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 7a7e33d..3d9b7d0 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,7 +3,6 @@ #include "common.h" #include "naive.h" -//#define DEBUG #define BLOCK_SIZE 128 #define BLOCK_COUNT(n) ((n + BLOCK_SIZE - 1) / BLOCK_SIZE) @@ -12,20 +11,6 @@ namespace Naive { // TODO: __global__ -__global__ void inclusiveToExclusiveScan(int n, int* odata, const int* idata) { - int tid = threadIdx.x + (blockIdx.x * blockDim.x); - if (tid >= n) { - return; - } - - if (tid == 0) { - odata[0] = 0; - return; - } - - odata[tid] = idata[tid - 1]; -} - __global__ void naiveScan(int n, int offset, int* odata, const int *idata) { int tid = threadIdx.x + (blockIdx.x * blockDim.x); if (tid >= n) { @@ -52,9 +37,6 @@ void scan(int n, int *odata, const int *idata) { cudaMemcpy(dev_odata1, idata, n * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(dev_odata2, idata, n * sizeof(int), cudaMemcpyHostToDevice); -#ifdef DEBUG - int* temp = new int[5]; -#endif int height = ilog2ceil(n); //height = 2; @@ -66,36 +48,16 @@ void scan(int n, int *odata, const int *idata) { (level % 2) == 0 ? dev_odata1 : dev_odata2, (level % 2) == 0 ? dev_odata2 : dev_odata1 ); - -#ifdef DEBUG - printf("----\n"); - cudaMemcpy(temp, dev_odata1, 5 * sizeof(int), cudaMemcpyDeviceToHost); - for (int j = 0; j < 5; ++j) { - printf("offset: %d, odata1[k]: %d\n", offset, temp[j]); - } - printf("\n"); - cudaMemcpy(temp, dev_odata2, 5 * sizeof(int), cudaMemcpyDeviceToHost); - for (int j = 0; j < 5; ++j) { - printf("offset: %d, odata2[k]: %d\n", offset, temp[j]); - } -#endif - } if (height % 2 == 0) { - inclusiveToExclusiveScan<<>>(n, dev_odata2, dev_odata1); + Common::inclusiveToExclusiveScanResult << > >(n, dev_odata2, dev_odata1); cudaMemcpy(odata, dev_odata2, n * sizeof(int), cudaMemcpyDeviceToHost); } else { - inclusiveToExclusiveScan << > >(n, dev_odata1, dev_odata2); + Common::inclusiveToExclusiveScanResult << > >(n, dev_odata1, dev_odata2); cudaMemcpy(odata, dev_odata1, n * sizeof(int), cudaMemcpyDeviceToHost); } - odata[0] = 0; - -#ifdef DEBUG - delete[] temp; -#endif - cudaFree(dev_odata1); cudaFree(dev_odata2); } From d7c2ca54eb59e607679eea2e8153a1d9a561eba3 Mon Sep 17 00:00:00 2001 From: Trung Le Date: Tue, 20 Sep 2016 20:38:37 -0400 Subject: [PATCH 06/15] Moved #define BLOCK_SIZE and BLOCK_COUNT to common.h --- stream_compaction/common.h | 4 ++++ stream_compaction/efficient.cu | 3 --- stream_compaction/naive.cu | 3 --- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 7b55c66..cdeee04 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -7,6 +7,10 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define BLOCK_SIZE 128 +#define BLOCK_COUNT(n) ((n + BLOCK_SIZE - 1) / BLOCK_SIZE) + + /** * Check for CUDA errors; print and exit if there was a problem. */ diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 5457cd0..0e7f5b8 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,9 +3,6 @@ #include "common.h" #include "efficient.h" -#define BLOCK_SIZE 128 -#define BLOCK_COUNT(n) ((n + BLOCK_SIZE - 1) / BLOCK_SIZE) - namespace StreamCompaction { namespace Efficient { diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d9b7d0..3d29d2c 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,9 +3,6 @@ #include "common.h" #include "naive.h" -#define BLOCK_SIZE 128 -#define BLOCK_COUNT(n) ((n + BLOCK_SIZE - 1) / BLOCK_SIZE) - namespace StreamCompaction { namespace Naive { From bb454063b98ea5f6981ca74bd825c2bbc6b6d6b0 Mon Sep 17 00:00:00 2001 From: Trung Le Date: Fri, 23 Sep 2016 21:44:46 -0400 Subject: [PATCH 07/15] Cleaned up codes for work-efficient stream compaction --- stream_compaction/common.cu | 43 +++++++++++++++- stream_compaction/common.h | 3 ++ stream_compaction/efficient.cu | 90 +++++++++++++++++++++++++++------- 3 files changed, 118 insertions(+), 18 deletions(-) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2026069..cf8450a 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -14,6 +14,34 @@ void checkCUDAErrorFn(const char *msg, const char *file, int line) { exit(EXIT_FAILURE); } +/* Max reduction is really just the partial sum upsweep algorithm */ +__global__ void maxReduction(int n, int level, int* odata) { + int tid = threadIdx.x + (blockIdx.x * blockDim.x); + if (tid >= n) { + return; + } + + int twoToLevel = powf(2, level); + int twoToLevelPlusOne = powf(2, level + 1); + if (tid % twoToLevelPlusOne == 0) { + odata[tid + twoToLevelPlusOne - 1] = imax(odata[tid + twoToLevel - 1], odata[tid + twoToLevelPlusOne - 1]); + } +} + +int findMaxInDeviceArray(int n, int *dev_idata) { + + int height = ilog2ceil(n); + + + for (int level = 0; level < height; ++level) { + maxReduction << > >(n, level, dev_idata); + } + + int maxValue = 0; + cudaMemcpy(&maxValue, dev_idata + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + + return maxValue; +} namespace StreamCompaction { namespace Common { @@ -43,7 +71,12 @@ __global__ void inclusiveToExclusiveScanResult(int n, int* odata, const int* ida * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int tid = threadIdx.x + blockDim.x * blockIdx.x; + if (tid >= n) { + return; + } + + bools[tid] = (bool)idata[tid]; } /** @@ -53,6 +86,14 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int tid = threadIdx.x + blockDim.x * blockIdx.x; + if (tid >= n) { + return; + } + + if (bools[tid] == 1) { + odata[indices[tid]] = idata[tid]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index cdeee04..35fb581 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -10,6 +10,7 @@ #define BLOCK_SIZE 128 #define BLOCK_COUNT(n) ((n + BLOCK_SIZE - 1) / BLOCK_SIZE) +#define imax(a, b) (((a) > (b)) ? (a) : (b)) /** * Check for CUDA errors; print and exit if there was a problem. @@ -28,6 +29,8 @@ inline int ilog2ceil(int x) { return ilog2(x - 1) + 1; } +int findMaxInDeviceArray(int n, int *idata); + namespace StreamCompaction { namespace Common { diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 0e7f5b8..e0ccc15 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -14,8 +14,8 @@ __global__ void upsweep(int n, int level, int* odata) { return; } - int twoToLevel = powf(2, level); - int twoToLevelPlusOne = powf(2, level + 1); + int twoToLevel = 1 << level; + int twoToLevelPlusOne = 1 << (level + 1); if (tid % twoToLevelPlusOne == 0) { odata[tid + twoToLevelPlusOne - 1] += odata[tid + twoToLevel - 1]; } @@ -26,8 +26,8 @@ __global__ void downsweep(int n, int level, int* odata) { if (tid >= n) { return; } - int twoToLevel = powf(2, level); - int twoToLevelPlusOne = powf(2, level + 1); + int twoToLevel = 1 << level; + int twoToLevelPlusOne = 1 << (level + 1); if (tid % twoToLevelPlusOne == 0) { int t = odata[tid + twoToLevel - 1]; @@ -36,30 +36,50 @@ __global__ void downsweep(int n, int level, int* odata) { } } +// Should only be launched with 1 thread? +__global__ void remainingElementsCountForCompact(const int boolIndex, int* dev_indices, const int* dev_bools, int* remainingElementsCount) { + *remainingElementsCount = dev_bools[boolIndex] == 1 ? boolIndex : boolIndex; +} + +void deviceScan(int n, int* dev_odata) { + + int height = ilog2ceil(n); + int ceilPower2 = 1 << height; + + for (int level = 0; level < height; ++level) { + upsweep << > >(ceilPower2, level, dev_odata); + cudaThreadSynchronize(); + } + + // Set the root to zero + cudaMemset(dev_odata + (ceilPower2 - 1), 0, sizeof(int)); + + // Downsweep + for (int level = height - 1; level >= 0; --level) { + downsweep << > >(ceilPower2, level, dev_odata); + cudaThreadSynchronize(); + } +} + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { // TODO int* dev_odata; - int height = ilog2ceil(n); - - int ceilPower2 = pow(2, height); + int ceilPower2 = 1 << height; cudaMalloc((void**)&dev_odata, ceilPower2 * sizeof(int)); + + // Reset to zeros cudaMemset(dev_odata, 0, ceilPower2 * sizeof(int)); - cudaMemcpy(dev_odata, idata, n * sizeof(int), cudaMemcpyHostToDevice); - for (int level = 0; level < height; ++level) { - upsweep << > >(ceilPower2, level, dev_odata); - } + // Copy idata to device memory + cudaMemcpy(dev_odata, idata, n * sizeof(int), cudaMemcpyHostToDevice); - cudaMemset(dev_odata + (ceilPower2 - 1), 0, sizeof(int)); - - for (int level = height - 1; level >= 0; --level) { - downsweep << > >(ceilPower2, level, dev_odata); - } + deviceScan(n, dev_odata); + // Transfer data back to host cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); cudaFree(dev_odata); @@ -76,7 +96,43 @@ void scan(int n, int *odata, const int *idata) { */ int compact(int n, int *odata, const int *idata) { // TODO - return -1; + + int height = ilog2ceil(n); + int ceilPower2 = 1 << height; + int *dev_bools, *dev_indices, *dev_odata, *dev_idata; + cudaMalloc((void**)&dev_bools, sizeof(int) * ceilPower2); + cudaMalloc((void**)&dev_idata, sizeof(int) * ceilPower2); + cudaMalloc((void**)&dev_indices, sizeof(int) * ceilPower2); + cudaMalloc((void**)&dev_odata, sizeof(int) * ceilPower2); + + // Transfer idata from host to device + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + // Set all non-zeros to 1s and zeros to 0s. This is our pass condition for an element to remain/discard after compaction + Common::kernMapToBoolean << > >(ceilPower2, dev_bools, dev_idata); + + // Compute indices of the out compacted stream + // Reset to zeros + cudaMemset(dev_indices, 0, ceilPower2 * sizeof(int)); + // Copy dev_bools to dev_indices to device memory + cudaMemcpy(dev_indices, dev_bools, n * sizeof(int), cudaMemcpyDeviceToDevice); + StreamCompaction::Efficient::deviceScan(ceilPower2, dev_indices); + + // Move elements that are not discarded into appropriate slots based on scan result + Common::kernScatter << > >(ceilPower2, dev_odata, dev_idata, dev_bools, dev_indices); + + // Transfer output back to host + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + + // The max value of all the valid indices for the compacted stream is the number of remaining elements + int remainingElementsCount = findMaxInDeviceArray(ceilPower2, dev_indices); + + // Cleanup + cudaFree(dev_idata); + cudaFree(dev_indices); + cudaFree(dev_odata); + + return remainingElementsCount; } } From 5125f043ca384921e78304a52bf18ece2db5c275 Mon Sep 17 00:00:00 2001 From: Trung Le Date: Sun, 25 Sep 2016 18:46:00 -0400 Subject: [PATCH 08/15] Adding profiling code --- src/main.cpp | 111 ++++++++++++++++++++++------ stream_compaction/CMakeLists.txt | 3 +- stream_compaction/common.h | 8 +- stream_compaction/efficient.cu | 60 +++++++++++++-- stream_compaction/naive.cu | 19 ++++- stream_compaction/profilingcommon.h | 10 +++ stream_compaction/thrust.cu | 13 ++++ 7 files changed, 189 insertions(+), 35 deletions(-) create mode 100644 stream_compaction/profilingcommon.h diff --git a/src/main.cpp b/src/main.cpp index 675da35..337acde 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,12 +11,21 @@ #include #include #include +#include #include "testing_helpers.hpp" +#include int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; - const int NPOT = SIZE - 3; - int a[SIZE], b[SIZE], c[SIZE]; + const int SIZE = 1 << 10; + const int NPOT = SIZE - 5; + int* a = new int[SIZE]; + int* b = new int[SIZE]; + int* c = new int[SIZE]; + +#ifdef PROFILE + printDesc("PROFILING ON"); + printf("\n\n"); +#endif // Scan tests @@ -27,17 +36,41 @@ int main(int argc, char* argv[]) { genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; - printArray(SIZE, a, true); + //printArray(SIZE, a, true); - zeroArray(SIZE, b); + zeroArray(SIZE, b); printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printArray(SIZE, b, true); + +#ifdef PROFILE + auto begin = std::chrono::high_resolution_clock::now(); + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { +#endif + + StreamCompaction::CPU::scan(SIZE, b, a); + +#ifdef PROFILE + } + auto end = std::chrono::high_resolution_clock::now(); + std::cout << "Runtime: " << std::chrono::duration_cast((end - begin)/PROFILE_ITERATIONS).count() << " ns" << std::endl; +#endif + + //printArray(SIZE, b, true); zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printArray(NPOT, b, true); + +#ifdef PROFILE + begin = std::chrono::high_resolution_clock::now(); + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { +#endif + StreamCompaction::CPU::scan(NPOT, c, a); +#ifdef PROFILE + } + end = std::chrono::high_resolution_clock::now(); + std::cout << "Runtime: " << std::chrono::duration_cast((end - begin) / PROFILE_ITERATIONS).count() << " ns" << std::endl; +#endif + + //printArray(NPOT, b, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); @@ -66,14 +99,14 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - //printArray(SIZE, c, true); + StreamCompaction::Thrust::scan(SIZE, c, a); + //printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - //printArray(NPOT, c, true); + StreamCompaction::Thrust::scan(NPOT, c, a); + //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -85,28 +118,55 @@ int main(int argc, char* argv[]) { genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; - printArray(SIZE, a, true); + //printArray(SIZE, a, true); - int count, expectedCount, expectedNPOT; + int count = 0, expectedCount, expectedNPOT; zeroArray(SIZE, b); printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - expectedCount = count; - printArray(count, b, true); +#ifdef PROFILE + begin = std::chrono::high_resolution_clock::now(); + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { +#endif + count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); +#ifdef PROFILE + } + end = std::chrono::high_resolution_clock::now(); + std::cout << "Runtime: " << std::chrono::duration_cast((end - begin) / PROFILE_ITERATIONS).count() << " ns" << std::endl; +#endif + expectedCount = count; + //printArray(count, b, true); printCmpLenResult(count, expectedCount, b, b); zeroArray(SIZE, c); printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - expectedNPOT = count; - printArray(count, c, true); +#ifdef PROFILE + begin = std::chrono::high_resolution_clock::now(); + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { +#endif + count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); +#ifdef PROFILE + } + end = std::chrono::high_resolution_clock::now(); + std::cout << "Runtime: " << std::chrono::duration_cast((end - begin) / PROFILE_ITERATIONS).count() << " ns" << std::endl; +#endif + expectedNPOT = count; + //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); zeroArray(SIZE, c); printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printArray(count, c, true); +#ifdef PROFILE + begin = std::chrono::high_resolution_clock::now(); + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { +#endif + count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); +#ifdef PROFILE + } + end = std::chrono::high_resolution_clock::now(); + std::cout << "Runtime: " << std::chrono::duration_cast((end - begin) / PROFILE_ITERATIONS).count() << " ns" << std::endl; +#endif + //printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); @@ -120,4 +180,9 @@ int main(int argc, char* argv[]) { count = StreamCompaction::Efficient::compact(NPOT, c, a); //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + + + delete[] a; + delete[] b; + delete[] c; } diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..a97c3e4 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -1,6 +1,7 @@ set(SOURCE_FILES "common.h" "common.cu" + "profilingcommon.h" "cpu.h" "cpu.cu" "naive.h" @@ -13,5 +14,5 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_52 ) diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 35fb581..26a03e1 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -7,10 +7,14 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define imin(a, b) (((a) < (b)) ? (a) : (b)) +#define imax(a, b) (((a) > (b)) ? (a) : (b)) + #define BLOCK_SIZE 128 -#define BLOCK_COUNT(n) ((n + BLOCK_SIZE - 1) / BLOCK_SIZE) +#define BLOCK_COUNT(n) (((n) + BLOCK_SIZE - 1) / BLOCK_SIZE) -#define imax(a, b) (((a) > (b)) ? (a) : (b)) +// Milliseconds to nanoseconds +#define MS_TO_NS(ms) ((ms) * 1000000) /** * Check for CUDA errors; print and exit if there was a problem. diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index e0ccc15..22ca44b 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,8 @@ #include "common.h" #include "efficient.h" +#define PROFILE + namespace StreamCompaction { namespace Efficient { @@ -77,12 +79,36 @@ void scan(int n, int *odata, const int *idata) { // Copy idata to device memory cudaMemcpy(dev_odata, idata, n * sizeof(int), cudaMemcpyHostToDevice); +#ifdef PROFILE + // CUDA events for profiling + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); +#endif + +#ifdef PROFILE + cudaEventRecord(start); + // -- Start code to profile +#endif deviceScan(n, dev_odata); - +#ifdef PROFILE + // -- End code to profile + cudaEventRecord(stop); +#endif + + +#ifdef PROFILE + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + printf("Runtime: %d ns\n", (int)MS_TO_NS(milliseconds)); +#endif // Transfer data back to host cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + // Cleanup cudaFree(dev_odata); + } /** @@ -108,6 +134,18 @@ int compact(int n, int *odata, const int *idata) { // Transfer idata from host to device cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); +#ifdef PROFILE + // CUDA events for profiling + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); +#endif + +#ifdef PROFILE + // -- Start code block to profile + cudaEventRecord(start); +#endif + // Set all non-zeros to 1s and zeros to 0s. This is our pass condition for an element to remain/discard after compaction Common::kernMapToBoolean << > >(ceilPower2, dev_bools, dev_idata); @@ -121,17 +159,29 @@ int compact(int n, int *odata, const int *idata) { // Move elements that are not discarded into appropriate slots based on scan result Common::kernScatter << > >(ceilPower2, dev_odata, dev_idata, dev_bools, dev_indices); - // Transfer output back to host - cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); - // The max value of all the valid indices for the compacted stream is the number of remaining elements int remainingElementsCount = findMaxInDeviceArray(ceilPower2, dev_indices); + +#ifdef PROFILE + // -- End code block to profile + cudaEventRecord(stop); +#endif + + // Transfer output back to host + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); // Cleanup cudaFree(dev_idata); cudaFree(dev_indices); cudaFree(dev_odata); - + +#ifdef PROFILE + // Print runtime result + cudaEventSynchronize(stop); + float milliseconds; + cudaEventElapsedTime(&milliseconds, start, stop); + printf("Runtime: %d ns\n", (int)MS_TO_NS(milliseconds)); +#endif return remainingElementsCount; } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d29d2c..9d75938 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -26,6 +26,10 @@ __global__ void naiveScan(int n, int offset, int* odata, const int *idata) { void scan(int n, int *odata, const int *idata) { // TODO + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + int* dev_odata1; int* dev_odata2; @@ -34,11 +38,11 @@ void scan(int n, int *odata, const int *idata) { cudaMemcpy(dev_odata1, idata, n * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(dev_odata2, idata, n * sizeof(int), cudaMemcpyHostToDevice); - + + cudaEventRecord(start); int height = ilog2ceil(n); - //height = 2; for (int level = 1; level <= height; ++level) { - int offset = pow(2, level - 1); + int offset = 1 << (level - 1); naiveScan << > >( n, offset, @@ -49,12 +53,19 @@ void scan(int n, int *odata, const int *idata) { if (height % 2 == 0) { Common::inclusiveToExclusiveScanResult << > >(n, dev_odata2, dev_odata1); + cudaEventRecord(stop); cudaMemcpy(odata, dev_odata2, n * sizeof(int), cudaMemcpyDeviceToHost); } else { Common::inclusiveToExclusiveScanResult << > >(n, dev_odata1, dev_odata2); - cudaMemcpy(odata, dev_odata1, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaEventRecord(stop); + cudaMemcpy(odata, dev_odata1, n * sizeof(int), cudaMemcpyDeviceToHost); } + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + printf("Runtime: %d ns\n", (int)MS_TO_NS(milliseconds)); + cudaFree(dev_odata1); cudaFree(dev_odata2); } diff --git a/stream_compaction/profilingcommon.h b/stream_compaction/profilingcommon.h new file mode 100644 index 0000000..ede143c --- /dev/null +++ b/stream_compaction/profilingcommon.h @@ -0,0 +1,10 @@ +#pragma once + +#include + +#define PROFILE + +#ifdef PROFILE +#include +#define PROFILE_ITERATIONS 1000 +#endif \ No newline at end of file diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 16bbd17..473f9ba 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -5,6 +5,7 @@ #include #include "common.h" #include "thrust.h" +#include "profilingcommon.h" namespace StreamCompaction { namespace Thrust { @@ -20,8 +21,20 @@ void scan(int n, int *odata, const int *idata) { // Convert to device vector thrust::device_vector dev_idata(idata, idata + n); thrust::device_vector dev_odata(odata, odata + n); + +#ifdef PROFILE + auto begin = std::chrono::high_resolution_clock::now(); + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { +#endif + thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin()); +#ifdef PROFILE + } + auto end = std::chrono::high_resolution_clock::now(); + std::cout << "Runtime: " << std::chrono::duration_cast((end - begin) / PROFILE_ITERATIONS).count() << " ns" << std::endl; +#endif + thrust::host_vector host_odata = dev_odata; cudaMemcpy(odata, host_odata.data(), n * sizeof(int), cudaMemcpyHostToHost); } From 435a0d8f9c091b46c9638c34a02d52852e488130 Mon Sep 17 00:00:00 2001 From: Trung Le Date: Sun, 25 Sep 2016 20:07:50 -0400 Subject: [PATCH 09/15] Clean up profiling --- src/main.cpp | 157 +++++++++++++++++++++++++-------- stream_compaction/cpu.cu | 8 +- stream_compaction/efficient.cu | 8 +- stream_compaction/efficient.h | 4 +- stream_compaction/naive.cu | 4 +- stream_compaction/naive.h | 2 +- stream_compaction/thrust.cu | 6 +- stream_compaction/thrust.h | 2 +- 8 files changed, 136 insertions(+), 55 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 337acde..3cb598c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -16,13 +16,16 @@ #include int main(int argc, char* argv[]) { - const int SIZE = 1 << 10; - const int NPOT = SIZE - 5; - int* a = new int[SIZE]; - int* b = new int[SIZE]; - int* c = new int[SIZE]; + const int SIZE = 1 << 16; + const int NPOT = SIZE - 3; + int a[SIZE]; + int b[SIZE]; + int c[SIZE]; #ifdef PROFILE + float timeElapsedMs = 0; + float totalTimeElapsedMs = 0; + printDesc("PROFILING ON"); printf("\n\n"); #endif @@ -36,7 +39,7 @@ int main(int argc, char* argv[]) { genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; - //printArray(SIZE, a, true); + printArray(SIZE, a, true); zeroArray(SIZE, b); printDesc("cpu scan, power-of-two"); @@ -51,10 +54,10 @@ int main(int argc, char* argv[]) { #ifdef PROFILE } auto end = std::chrono::high_resolution_clock::now(); - std::cout << "Runtime: " << std::chrono::duration_cast((end - begin)/PROFILE_ITERATIONS).count() << " ns" << std::endl; + std::cout << "Runtime: " << std::chrono::duration_cast((end - begin)/PROFILE_ITERATIONS).count() / 1000000.0f << " ms" << std::endl; #endif - //printArray(SIZE, b, true); + printArray(SIZE, b, true); zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); @@ -67,46 +70,109 @@ int main(int argc, char* argv[]) { #ifdef PROFILE } end = std::chrono::high_resolution_clock::now(); - std::cout << "Runtime: " << std::chrono::duration_cast((end - begin) / PROFILE_ITERATIONS).count() << " ns" << std::endl; + std::cout << "Runtime: " << std::chrono::duration_cast((end - begin) / PROFILE_ITERATIONS).count() / 1000000.0f << " ms" << std::endl; #endif - //printArray(NPOT, b, true); + printArray(NPOT, b, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - //printArray(SIZE, c, true); + +#ifdef PROFILE + totalTimeElapsedMs = 0; + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { + timeElapsedMs = 0; +#endif + StreamCompaction::Naive::scan(SIZE, c, a, &timeElapsedMs); +#ifdef PROFILE + totalTimeElapsedMs += timeElapsedMs; + } + std::cout << "Runtime: " << totalTimeElapsedMs / PROFILE_ITERATIONS << " ms" << std::endl; +#endif + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - //printArray(SIZE, c, true); +#ifdef PROFILE + totalTimeElapsedMs = 0; + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { + timeElapsedMs = 0; +#endif + StreamCompaction::Naive::scan(NPOT, c, a, &timeElapsedMs); +#ifdef PROFILE + totalTimeElapsedMs += timeElapsedMs; + } + std::cout << "Runtime: " << totalTimeElapsedMs / PROFILE_ITERATIONS << " ms" << std::endl; +#endif + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - //printArray(SIZE, c, true); + +#ifdef PROFILE + totalTimeElapsedMs = 0; + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { + timeElapsedMs = 0; +#endif + StreamCompaction::Efficient::scan(SIZE, c, a, &timeElapsedMs); +#ifdef PROFILE + totalTimeElapsedMs += timeElapsedMs; + } + std::cout << "Runtime: " << totalTimeElapsedMs / PROFILE_ITERATIONS << " ms" << std::endl; +#endif + + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - //printArray(NPOT, c, true); +#ifdef PROFILE + totalTimeElapsedMs = 0; + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { + timeElapsedMs = 0; +#endif + StreamCompaction::Efficient::scan(NPOT, c, a, &timeElapsedMs); +#ifdef PROFILE + totalTimeElapsedMs += timeElapsedMs; + } + std::cout << "Runtime: " << totalTimeElapsedMs / PROFILE_ITERATIONS << " ms" << std::endl; +#endif + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - //printArray(SIZE, c, true); +#ifdef PROFILE + totalTimeElapsedMs = 0; + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { + timeElapsedMs = 0; +#endif + StreamCompaction::Thrust::scan(SIZE, c, a, &timeElapsedMs); +#ifdef PROFILE + totalTimeElapsedMs += timeElapsedMs; + } + std::cout << "Runtime: " << totalTimeElapsedMs / PROFILE_ITERATIONS << " ms" << std::endl; +#endif + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - //printArray(NPOT, c, true); +#ifdef PROFILE + totalTimeElapsedMs = 0; + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { + timeElapsedMs = 0; +#endif + StreamCompaction::Thrust::scan(NPOT, c, a, &timeElapsedMs); +#ifdef PROFILE + totalTimeElapsedMs += timeElapsedMs; + } + std::cout << "Runtime: " << totalTimeElapsedMs / PROFILE_ITERATIONS << " ms" << std::endl; +#endif + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -118,7 +184,7 @@ int main(int argc, char* argv[]) { genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; - //printArray(SIZE, a, true); + printArray(SIZE, a, true); int count = 0, expectedCount, expectedNPOT; @@ -132,10 +198,10 @@ int main(int argc, char* argv[]) { #ifdef PROFILE } end = std::chrono::high_resolution_clock::now(); - std::cout << "Runtime: " << std::chrono::duration_cast((end - begin) / PROFILE_ITERATIONS).count() << " ns" << std::endl; + std::cout << "Runtime: " << std::chrono::duration_cast((end - begin) / PROFILE_ITERATIONS).count() / 1000000.0f << " ms" << std::endl; #endif expectedCount = count; - //printArray(count, b, true); + printArray(count, b, true); printCmpLenResult(count, expectedCount, b, b); zeroArray(SIZE, c); @@ -148,10 +214,10 @@ int main(int argc, char* argv[]) { #ifdef PROFILE } end = std::chrono::high_resolution_clock::now(); - std::cout << "Runtime: " << std::chrono::duration_cast((end - begin) / PROFILE_ITERATIONS).count() << " ns" << std::endl; + std::cout << "Runtime: " << std::chrono::duration_cast((end - begin) / PROFILE_ITERATIONS).count() / 1000000.0f << " ms" << std::endl; #endif expectedNPOT = count; - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); zeroArray(SIZE, c); @@ -164,25 +230,40 @@ int main(int argc, char* argv[]) { #ifdef PROFILE } end = std::chrono::high_resolution_clock::now(); - std::cout << "Runtime: " << std::chrono::duration_cast((end - begin) / PROFILE_ITERATIONS).count() << " ns" << std::endl; + std::cout << "Runtime: " << std::chrono::duration_cast((end - begin) / PROFILE_ITERATIONS).count() / 1000000.0f << " ms" << std::endl; #endif - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - //printArray(count, c, true); +#ifdef PROFILE + totalTimeElapsedMs = 0; + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { + timeElapsedMs = 0; +#endif + count = StreamCompaction::Efficient::compact(SIZE, c, a, &timeElapsedMs); +#ifdef PROFILE + totalTimeElapsedMs += timeElapsedMs; + } + std::cout << "Runtime: " << totalTimeElapsedMs / PROFILE_ITERATIONS << " ms" << std::endl; +#endif + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - //printArray(count, c, true); +#ifdef PROFILE + totalTimeElapsedMs = 0; + for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { + timeElapsedMs = 0; +#endif + count = StreamCompaction::Efficient::compact(NPOT, c, a, &timeElapsedMs); +#ifdef PROFILE + totalTimeElapsedMs += timeElapsedMs; + } + std::cout << "Runtime: " << totalTimeElapsedMs / PROFILE_ITERATIONS << " ms" << std::endl; +#endif + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); - - - delete[] a; - delete[] b; - delete[] c; } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 0b46006..294f06d 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -39,13 +39,15 @@ int compactWithoutScan(int n, int *odata, const int *idata) { */ int compactWithScan(int n, int *odata, const int *idata) { // TODO - memset(odata, 0, n); - for (int i = 0; i < n; ++i) { + memset(odata, 0, n * sizeof(int)); + + for (int i = 0; i < n; ++i) { if (idata[i] != 0) { odata[i] = 1; } } - int* scanResult = new int[n]; + + int* scanResult = new int[n]; scan(n, scanResult, odata); int remainingNumberOfElements = 0; diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 22ca44b..3f23434 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -66,7 +66,7 @@ void deviceScan(int n, int* dev_odata) { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ -void scan(int n, int *odata, const int *idata) { +void scan(int n, int *odata, const int *idata, float* timeElapsedMs) { // TODO int* dev_odata; int height = ilog2ceil(n); @@ -101,7 +101,7 @@ void scan(int n, int *odata, const int *idata) { cudaEventSynchronize(stop); float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop); - printf("Runtime: %d ns\n", (int)MS_TO_NS(milliseconds)); + *timeElapsedMs = milliseconds; #endif // Transfer data back to host cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); @@ -120,7 +120,7 @@ void scan(int n, int *odata, const int *idata) { * @param idata The array of elements to compact. * @returns The number of elements remaining after compaction. */ -int compact(int n, int *odata, const int *idata) { +int compact(int n, int *odata, const int *idata, float* timeElapsedMs) { // TODO int height = ilog2ceil(n); @@ -180,7 +180,7 @@ int compact(int n, int *odata, const int *idata) { cudaEventSynchronize(stop); float milliseconds; cudaEventElapsedTime(&milliseconds, start, stop); - printf("Runtime: %d ns\n", (int)MS_TO_NS(milliseconds)); + *timeElapsedMs = milliseconds; #endif return remainingElementsCount; } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 395ba10..35d9b46 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -2,8 +2,8 @@ namespace StreamCompaction { namespace Efficient { - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata, float* timeElapsedMs); - int compact(int n, int *odata, const int *idata); + int compact(int n, int *odata, const int *idata, float* timeElapsedMs); } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9d75938..bf3790a 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -23,7 +23,7 @@ __global__ void naiveScan(int n, int offset, int* odata, const int *idata) { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ -void scan(int n, int *odata, const int *idata) { +void scan(int n, int *odata, const int *idata, float* timeElapsedMs) { // TODO cudaEvent_t start, stop; @@ -64,7 +64,7 @@ void scan(int n, int *odata, const int *idata) { cudaEventSynchronize(stop); float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop); - printf("Runtime: %d ns\n", (int)MS_TO_NS(milliseconds)); + *timeElapsedMs = milliseconds; cudaFree(dev_odata1); cudaFree(dev_odata2); diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 21152d6..25b7c4e 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -2,6 +2,6 @@ namespace StreamCompaction { namespace Naive { - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata, float* timeElapsedMs); } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 473f9ba..a109198 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -13,7 +13,7 @@ namespace Thrust { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ -void scan(int n, int *odata, const int *idata) { + void scan(int n, int *odata, const int *idata, float* timeElapsedMs) { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); @@ -24,15 +24,13 @@ void scan(int n, int *odata, const int *idata) { #ifdef PROFILE auto begin = std::chrono::high_resolution_clock::now(); - for (auto it = 0; it < PROFILE_ITERATIONS; ++it) { #endif thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin()); #ifdef PROFILE - } auto end = std::chrono::high_resolution_clock::now(); - std::cout << "Runtime: " << std::chrono::duration_cast((end - begin) / PROFILE_ITERATIONS).count() << " ns" << std::endl; + *timeElapsedMs = std::chrono::duration_cast((end - begin) / PROFILE_ITERATIONS).count() / 1000000.0f; #endif thrust::host_vector host_odata = dev_odata; diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h index 06707f3..44d7fcf 100644 --- a/stream_compaction/thrust.h +++ b/stream_compaction/thrust.h @@ -2,6 +2,6 @@ namespace StreamCompaction { namespace Thrust { - void scan(int n, int *odata, const int *idata); + void scan(int n, int *odata, const int *idata, float* timeElapsedMs); } } From cd40d4ee4d90fe9c0219584a97906f063e85b3d4 Mon Sep 17 00:00:00 2001 From: Trung Le Date: Sun, 25 Sep 2016 22:48:44 -0400 Subject: [PATCH 10/15] Add profile analysis images --- images/BlockSizePerformanceAnalysis.png | Bin 0 -> 14554 bytes images/CompactPerformaceAnalysis.png | Bin 0 -> 13411 bytes images/ScanPerformaceAnalysis.png | Bin 0 -> 22870 bytes stream_compaction/radixsort.cu | 18 ++++++++++++++++++ stream_compaction/radixsort.h | 7 +++++++ 5 files changed, 25 insertions(+) create mode 100644 images/BlockSizePerformanceAnalysis.png create mode 100644 images/CompactPerformaceAnalysis.png create mode 100644 images/ScanPerformaceAnalysis.png create mode 100644 stream_compaction/radixsort.cu create mode 100644 stream_compaction/radixsort.h diff --git a/images/BlockSizePerformanceAnalysis.png b/images/BlockSizePerformanceAnalysis.png new file mode 100644 index 0000000000000000000000000000000000000000..44952ea577a61437b3ffaed24ff2c327e3fa4f58 GIT binary patch literal 14554 zcmb_@by!qe+qZ%ON*G9&aYUqRK%@i|3_xlKQ9@v7kd~oT4j@u0IW#CWbmxGh2r_gx zD9F$Z(#^Lvderlr^M3F1UDx*y*SKA?_nNitb^qeNx4(+A0_AbWHg+;BA#e7u}O5 z@5Rc~+cOIW_Wg0g&e3s*LE+l`VM;(dSiCY2C5Ir(|wn~%czJ5F1vg^mv|4?Ebz zt?SjGk=_0?YwO~ZUT8hfRXE0_W@~hDK>$4WkV@8&_Yvp=Jva21f)@HqLJs}q2h;oe zb%VVBy#CJMFu3&JTz`Tk0waGPf5`hNxbpk;9}oU7uK#VK2jl-QKj3-O*6OU19~Dc} z=uY!!)aW*5RA_&RZ~tNS_AntjM0&4ppSWz2ReuL>yO+B8L3&3{9fNcd^4MBwa~j`^ zI?cVmR$5~nxl7pJv$YzTA1rNV+ihp--}G7}^)j#-uAtE^@z{12*B_*5JR#XoerbSR z^@@n~fH9-6nL+ipAY1KSK|w*;Nty|X+eC3j0kk~lv(p^Ry3$|S?xoy`oO4Mt*$Ry2 z?G&v&-BpopHCWOtv9tIApS!=6+nli5pMYXlijJCSjHJ`H-QU|?yV^6kk+8p+P`152 zlbPGZw*QUI&vS%JD<{fsybfjSxg3Lc@tky-6WCszHOgwl%VD1&WfB$YBQSn`nAt#Mbh$K4LOJm51x(1 z4=0S|XFk&D3o8UO3qnMWX8$(MYo_aRWxUN)Z_a~&`SG?S zIefbU8_a8O(#vMOyV}ET*$ZrL(&VE4{;EEH#ElR{Lxy$O{_K6isyhQMCAB?Qc%C7P z1I1gKkmrFr)0?4Qzq3Ht%h|op*;kdoWfeoir5R<>jqYzvd~e@hV3IWX7OhFs(y?@R z@9W(;!gv!($gWj9Fkx>c%~=P;q{nh=yrgAjh-1rSOI%^M<`zYC0?y@hY%T@vYeQ&r zY4s}4_NK;5F0KpXxfVPua>fu(M!JjX*;=~D6Mx&)x{RQv7 zndBf^Vi}N4O-+j8d?QX@rn)n=hTJGYdcv_-2z@DF_h~SgO32uIRY4JPJZOwxI~fFp z^Ja`~wRu_$%o357rigl||3LTb`iX%U%674gMG9JZ96J3oWxV&c%bxB2n(fgR>*o)rASlu{!f`sR4ranzNZ--PEip=H>k#re>;6ZS^%)qTR zu>$sC6W*tDs2@^r0BYDe} zPP4RIV#X~W*2EpB+Mq2PRZykWq{rR1+`U(9@p12(`8~vLOPsjKir#Y7Wb#}dnt6P; zZhw!kKe98Szn20t3t|#2>3{Sf2D^5joR&LpaAq4MdChVKQMuujXK*ne7-@w+Z6wxy zKdO5}Pt?mqD(tZ8n%vPa!LuVo8b?#*P^6~sjnmKIR?On}ZjrVcE`G+AU{@y-ycS|= z9)El2ruY#J&wODOd)i}-V|mDQegsq5Tu}w~OJJPdHm07|KQ;XFQu1^jr+}YjTXrxlF1>=&g5gCvGM(~u#NBA&%PE96W$lE~U6k5t z-1_#SW@fG)W?+5tA`Zrcz&%K9vMlI|s=gwg3?d>?k)GSO``#I&&$4xLa6ugJQ8$kA z_*?!I{^+$ayRq71mG@B>c4uAoOE5lfj@!X~rZA<}e#d2+2P<6w2vmD)&sEiISJrSX zx>B)9MDYiVH8%|_d9OlQ+j&5DH8ks_4qT%MoHOcv+0ehXOIfbkesys*~P_ ziAl5zS*++HZ=Qf(vgrTw7`6K{I3XWVShY%y-lymFN_g!LpR`vTCcckQ>t3~3PASpP zc@QDz6)1eSmkT-uk``L0t)yR}1)lrO2IpqUo`n7B1XuHa6x8dq+>?n8|1(BYL@pO* z*|TDu=G7WasPK0S$sjtn;ri^u(Z|x21EW)`lhXUHIG?+mv+}6PmKzM=>D$4PhAOoD zWkD`Ag4cXD_u?+EQoZ>Wny~Xn(yhgk##?J~MV4FE0s$Nck93HQH@OtcN^49GE$V z;_h{=OES8&6$|j5LrdaYE%jPz8WBH^5=KM)WiNxYzT-6G!*klhr+t|`u`~lY*BmJ` z;=ZVqyla%+`5=Aflfg6wf7$Z%j)Wk!VS?_58&!K-vuhr^TdRZ$tUW5~jY5e0NNFQJ z1M7mG)QckM!(Yf;cpIPYulUTRD$5w0RnKlTe11p9o20*m%;GM+MC*8c$l&P90u!9x zG#u5=Aqj)~q_aLo!oN#oX zGbzZsb2692Jx#Lw2GJS$ycgJux}z5GZM>!sAFNdpyH2TqQ(jzluNJoQO0sGMg6HfG0NP zl)iboeixo>p#xZ57+074A9@HGpRrcYwL)1GS#gMcH??7sSLqB*BpCy9v63B}JO|e* zZWAWRJ~G3R#Cdm}AlcXQk#hUxZNmk|^NsZET<*6EC?o24j?dmm57eu&-rKI(7k6Z$ zba{!U%8_wRmhW%Tu9UeB`}0|53#FXATw^y=9zRK@QjEw^shC-rnd~i1TNT1pLlm4Z zht86}ila!{uuZugKDPr> zxSvfL#fyTYIPFhO)0kP;2z35QWY=?i#`@r}pth{_ldhM#_s}>Mk~-a8>XM))pSlz3 z+iXE}G|bK7Xex`TY=TvlEeMtQxVa$y%4x?Q_Im zuL?hQ3*?MRk)|*VSv)4kzSnRmp(s@?K@08;V!y;UnS+E2#Hj`V+r5k~iij&7TSXY< z;j)KY7sh;@A5FQnm&~wGKxF&osa_+M{`UsqpND1P?lb-~1<~02(nRBSwnN2|odntL zNQm1%k^L-B*86q4QFk#{RA!*Fct)11XDfX5140RLsOG59IYWRKTs^BTWZ6T z*Ag%3rU=-XQgbv+Nm)5q4CJ<_4rEJfgSIh%2A=XhIz-ma+{V z4|3lt;#d$c&XEscwO$x1_r?<0{Aort6@*tQp^UiVwO0l4?mxU??i-H^jT=snR+T$U zMjL+c{g_I6f^|+vU6$RlDM}!zK1$N9WO}%=v>a%Kd+w3++%|tc!B0_|1!Fm19}rTM zlt)Dn2T3O2+U$IMt)c`V7gE8E^36?(loO=l77HtlnxbKi#wI5HiuWp>V8`Ui%z{Ur z;OVo@7h~>U!cJ&L$y(Qg{mSz|7_3P z-)C&k#0_uFDAhmXow5dBo@`xjjuA2qjw7hJW%-cPZT0k*SjRMc`n1R*hgZH|>D;_B z)&3#&s?+gbTQkSR-Aj@Q_VX1*&))8~wOgjsY!7N@$9*tHMJF3Xz{j3;w`ScMPfR&- z70ptRWYQRc_%`;@F2rtQ(a?>m#nCYD3Ddb~V&ogKuznHk8Oc73o6BG$pH;aiGvoU0 zaea99z3yt8{(_Wu0zfdH(Mjn(LaceoYC_D^M-@;#}_37t=oC{M7yg_Ghyc4=RGt@3_zntO+V%kD8`<~CQjdNa=0!Pq#Jq(3n=rvfK>2A!@ z!(3HM`N!aa)@c zcu?sK19YyaF@#N8&p_IumiH1pgiE!2?98S<}eL-oKEvX|6WFis1 zs>Nrv)g2l1+!VAbAh5!2VBnJJ&MH8zhL=D3^2~KQB{Iu}EMj_zl!1Yv=Lt=>L9tbT z$6R0j2gY=8gJ@PsWNn0blrPAE10)O6*V3`KdsfxPRfNsj-Z7HGgvIT@K5uAg5dluf z(!A-rCx=DeGZr~bwPAbeY3mGJg|02*n+sQKrUL^LizB z#BSW9ePmO$m1jhNUmKhTHjvYNSI&1CT%1Mf)OdP0Y^_ZHDXy3Q(8s!sJ?yT{+f_lr2;yR?H!Fmw60j8ZQ7zI6`p{S{hQSX_HMTqf+Wt zsqHN5vo7-kFx}$m?4Hj2Z~=8r;hR>X`Jru_& z<8qfZolJoNm7SpuOVqUdp{(Dpi^wEqvd9s0N+E}xo3bi^5V?bdkuWW*Fb zbSQmJB#I&BqM5RcAMkxlRfAMu&K zL4RFtG1CrH@d$SY>GxIZu7Y)LysNAc!t9xl%$MwUR3RbfaBl-d#{1~~D8Lnh@Gtd{qw~_ykjqtNcS4nw2+G=+ z#wXb0ROWB(crdLb$~_1qIN{D@96@S{BEtuKNx5K>)sJO-zI0TCuX^MBU%oY` zb<+fdVEnEC$}go(q3Rf`Z?Vh@e=d=~bmh_KYgTa_t&&u%a`@YlbK^le{!kVN(>i!; zAoet4)csam(8E066KpmS+&W`1X@)O^TT(CJ96~VlZI2X?Tek`>H3Mo1VwLDSY#HVW z9|^6qUI`wK0)kP%5v$M|cg60#RCDdAiSqgddd75dF0MvIN5|0x`~k`uo`5)#8WT62 z;8j(Mh=B)3g_H!%M-<-Oa9zk3PmLeXJY3d!>~DkZNi0E7ua#GT@dfHQZP$k8=3?s{ z=7xB;x!|N3c>(i|w?$hRd(ryua0(I7SAzx_NM+1!GCg;3ZpZjk)s#PfG~wpjkdX_; z`d4~M0r}QdN-0;8AJ}ggf}eVT5Yo2r>dw|J1sp2msruhI43Fe&->bRudc*oF?i+cM zexJv;4hY2ti#kLoA{o3VWHF?xf%jLJ$aNPe{pv9Gt-?LC5j?ubnJjL9Cc>*IOu8cPa1BrqJ3}IHU_meg>QcrJ z!la$4xt)n$`<;(JG#oHkED^8b0UcBz++>zSb_II?$^>&5^;k#6#xgqqFRss?Ko2(@ zH(Vu*N-q>#_YRv;6$5#Smwz8{4?*jJqI@HS(_CM>``+ecz6mndPevJzft@Z`>51vZZ^z=A(SqpPYUgbd#w5%S#GQ1PCa0B&b0Ab$Ci6zqcPGQw(Z z04U^2U8j=#<~}Vlu6hgx0`#>*ivfSocP9VNWDgcq z-1>^!T9yYs0X@g5nPrx_TXn*cDmqa;O&JBqNz zDox4-cTp&mostT0*_vj$Gu1`zWH276mxjeI{!4hRdcC#NG=5kDm24w`Aq&|>=+R}CcFn(X4K~g>L zGW`l2^8hrdzUu_dAn(k3A98$3q+Y=Xkt0fN-^S~eDsoyrJVIQq?pL4%F^GAw^gxvaDGw|0%*10efH=H&*8JFgpl$5dztmwn zq8=bAEyF?%HWAV=Q_S4FQfDgw-9rEod9}w<2c6WN1Z4*2 zI3OfZjgfpY7Zie{=H{#l&DxX8>R*;KN+OGZsH8eb+z=Ma0KGE?ibAO%ynuXK{PLms z^~F2x%lMW!OJ7PR+89=e$9b)a$52^-v;N5nNx(I1hp6p{VgTVZ$i(LXeK!EPukrOe9Xh-l*u;Gms}>De~!=WuBP3^f`hWSZLT;t%G0>IgJ_AXQECM6CJ@ z7-1H`bGs~_5*QviG<1Iuk4y51lW^@Q4Ia>p6L)EGp@56NxE0Et(oLZGea1H4DKlcL zYXJ}Qx~eVqb~dYB9q>*0%PbZl#gD$oj$Va$CLqq$MxTcK!7`%&U5)|=(GW_ZCn043i}BTlH9?NR=N;u+%|o9)_vv1^~`rNfyf)^<hpDd_p%)d3wx9+gDJa`lW^AZ@n+jk7MKSd)RwRj@6yB)Max@mNUHf<2_U&RaXiJo_Q&N-oaK=Q}X}s%s4w`PEGcAP0u%|M< zu2ID!r>-AThO$EISW>pWWx`N-p{+sDLch0-TvDYuO! ziU~mT#(-+mLlB79zIcNSFwt+2@lLDfcJ`|D{?0N{*7|`ipfo3QUNX)K(7W6BSjb&gTg!`r~(pwjuNrLX%!m)JQ`GzoF6T}oFFffs8mlKrtMz>@X(z5&09rv zpgP?L-Go&AgELHWwEkQhXE_42%1Ybq$OoNM1$;F-1+HWMm*$ED^=2YCSs#CX8s%GR zJ6aM#^$xfWPu`P)1c=MWE}f@NN;5V^AnNcJnGE4NC~#&rDiCOaDm_Fd|;vKjc2&6nQ@r4YRa5A;)KNBo|xzUNLy!y!B|cq=`3; zt*%Mv&?XmC8fUXH3D-#e(4)P9R$wf!P1B=K!^0nrR(lvF>;mKv1n}UW-C1U5|&`yqONV$*W;sRr?vg{zWLp%>khhD4JH!VBj|K z(F2&Y41gF#PRFb5aP}Na1K0wAhkxo+yO-N0Fz#4>{q!m+RX#bx)WDEJv#;c;S6sk= zPKHO%1<+ZOXP$_hqHQYEGO$mq;dPC_n&tD!#NTdivO87GyB1!V6mD5je6#77xcy)5!?GsEmgB%oCFQSPKSm zx#{xCaXkys*w+RS@HwT}yU$#tW2QYfLY65=!2`@gehmzov^XB`cP!EOi>lix_CZbk zM8-adDKW|^(e)T$R(o_yDhh$p$zRiJ2u^a=9nhp2EjLwaRB??fSAUQR>4y&#lgcXf1sYm?P@76allY*RgAUoj80DBnX)q z{heVFb?@(jk?=uaq7Rf7cP&jk^m zzbP_sP1_+JV9ut6+YnL}@a6G7Aaz)Ek!puuX?e9fR4VZTucBIiKbN zU$A`#2#2miu~9vhxbXF`-HbOVlD##lf)n4r`Ta2%nN~vG0729hkw; zQ$N4I4!OJs?B~a5$d4@JhsaVhAmoR=Kl(HP9q>OAEbs*4f6%f2LS+2s)PK_$Kc@bd zB)J_{vFm*Lh|F6hP+)*o^VwOzN+6B^)`^S=(u)U1;}53s`+$FpJqQS3N7bpBZ$wPi zBUSyz2t+g>I8gu>Q9$!;MOgNO#u)cy;q1-egW82!%58R_UFU`?drKkXbj}`=Y9Bx1 zL3-hoYRj<$0&?8_n(fU71D30fpCHFB>s~PqvP+r8>}!opO=Ez)@Uce>_>~V#LAe7D z_yPjt&{WUeEj$2msW%-}t`3#=r&gbmvMw?(6SkQ|ZM2DdO00E98z_Guy%Kto!8z9W zy4WQOS{dN$E*{35yXXyjy!e@j{RZ9ysIdHLP-=|1cqd|`2PN*3!ysZ6)tYXG0Sy9u z<`|37Zxj(__7g8M?iHhLD}Yah2laqd@SB~lhA%4`=Z&Fqj3*rzhQvBSv0gn>lLyd8 zHmI9Nd>??naD6aKAtC%*hbQ=kNm>Z2q(x)^aP=)|S6#Q(3AVg?Ra4??mbn+LAmUypt`H8Yqjeo>gUXcr;4tPaKZi?^!a`=>r zExMb+F00QKNtKoyIDE~A^gxY5(+%PPM};dNeIWu-18D}pNg6;+rM)>ul-efISqtlw zqhDiFx*B~HK3;6zS*Og!wG3)}XZW>wi&`N|&zf1zmqGyOw^D+MgfA6KbcZW#kHN*O zX@CgU6=>oRF!PSuYAx7wqKx<4HpGx4^*#HcO1twthFri(``6LzPLrVgVQs02si$Jp z`mu?C5j@3OfVQ^m*UR>kEi9JoRs9$KD%z;(6(-IDk!YH^pwWFv?p6AWwg5p1m_^qj z11k*CyOTbt(?y{v;gHdK4(g~A3Wu2Oi^IQ->-vQhm^jEr>fz6iA$)|aRe$|3h}PUB>d+eA-0WYEY4;Xw0^ z)hG}Pk_Fh;n?R@ec&sMR$i#)juuVvevs|*^dw;ERELEUSG1ghggI0(2TIM3wN-%X$PU$*5FF#oal2i0GIOOS|9 z1_iRe4Q~VUJuo?cNq(p<{M%?^q!D!U|IQ%z-|$P}$p7Gy{XYQM(;ool^q&9*3Z)~^ zE2{5uu# zl(KZ^lX15u=2&cJ>KHMY?=P%(oFC}!%F^ll`t@rR=%S(1HvQpbKwepqWp9hsa7B0N z{YF6GCS%gfn_V>0RnE~uc9f2e&O!Cv8bmrGauyc8ys{up@87oMGU_4fDD~r{J{eKr z@CJ3IbDfDY#K6?zbl6-narpY_6!7M(X?gCQ(VhH^^>&W?mqPmn#Vs(GdHbYZQr5@s zOf{+qwnJrxl-eN0cuWczygS4Ms_}}O668j~MK(}pF*xDimqRg_2BMII^?yaPm;WuA z1&=YXH;;m1HG{C3%Ft?SoVY1+eZd-dbS5A-7_}rw6Lf9yO{i3;cFa|ZT%Gw7vEY3| zDlZ`$xRRhuf8ytXT{6oH^S|n`<@i1*40SyEDfK1gC}75HHF*oFwMy?-#r4dBV$&ES zw@DIIv<>n#`WN_mkI1V927$FRw-p<>fiTqesw0NV0GfWI$8BpmZGN(SO;E=_7$kQN z$g}{d$pp&4zhL4G_Vu|k&|~mEsB3|gUru#8u*LI?}wt*x*PR~s@V49{J zHzyJdx}TyzfAQeDpd~0iD@{uPZ^alnP-<(d@4amXNm|PkE6wkcw%Ih<$P!gxVIL=gBbfc#rRwnhVn_?#Yn2v;qKVj{z4>);#;^+pPnma?uIsUi)rkpZORz6m)h#xv} z9_lFH#`8R==n7;O6I^no^#iIzc9nCbMN9Key{b4@F&3D5j!8gvI6SC&++MYbkHG_m zg8jLs5{%sW=AG=~CE>`G4AIyE;EgYY>nh~xmRKEw(Xx5%8o1@t-7K>5|wuPz_e{Ks}b49=Yj8+h0zMz z{c}E2HUfwWXl)spxH(u}xLNTW*CFyg3b7wZX#G6zkgkAoIH)j?fPSn1(S>tl=N~-# zs~-X)-`~FjB`hd1pgi{1=kGlezs3Lo$M_GR`q4B2t{(yt1=@7bWbrrjgXGnZl=^Qq zj{iP;Llu#~DW?N8{WqQUn{qmQAe?^bEa-)e?=N-y^CgXcyM9v$^cOUbhe*diiUW0! zPBffL|JpYT<}T3-7l?Q7JijZoS-V0Q9fe*|Kzw?64&Pg|2TKln7VqCPE~>Y*^+te% z{PrtQ!+Wn6)xcW>PD(UL3}>~ESZ_Jp#K|S0eG@S7mz zwO3yOns7gIr|!<)dSD$r_f0wP6fbFD?Z|@h8w%1}udH)@_AZ8?6+Cyw*<*bx3|9J>T!}%G3U`ufaZs0ubq0Yv-ghCgT@kMf0yJ|B=<~DJm}xa zG)`JsQ7rnoHZR$IYFy%vd#qYDHZ>K|6(B2QzFBaC1WJwO*`nhE zdU{^MfM-D=^!<&X|KBg>{0h$h@{OYnB-E9}33$L+G$5@&DVQk7UNMXoHdlLC&o{kG z484lm^i4QTu4GeuVS$G_Eu;UMDq`{4REd@p5I5&>qfws zLbgA6SLfiZ4#2sM26vYtYlIg2-Y%RdGu}} z7YNvrXu+FZ%O?l?dIs~2{LI@D59Z7k{zy?b3(HQ__>(?A6a5R>V&mOPmrUQKqyD7W9*;86=0$wGW z8DtM%J_$N#MEd*t5unoM-f+<4*&&zu7Bov7#RFHp6YBk22XzgYBy#0Bk9v4)fjkww zG2?}D`fc7mvug0(!m;%l%U<3XS9^%x6#*jiP9!7NPbH=RKvE3Yr+W1H!v~8umM+-| zIw;7Syq23qtt&a8uJT&NxT~-d#ct15$@TkITc*<6CW`TszcR^u|t!55QwJX!EQ@37Wjf9#QWe0y)WwDfiOA{(8;+Zl7C{Mcw+SR}J(7 z6hIj?>u{&*r|X;nM-QH$mINpnsLqzYbB8l|f}WW*4Dw3`XM2))X_Hc=hx@mWbok2A z-5k&f`2q$*CzF4u^a}Es7}f_(GP13gyO&k&%9(T-FPZrr{n@4Z>oEQ|ujl<+hKKF| q&=mar9T5O#{m1oxw%NL4HoSZu%a9_JL%%|IxphPNdXB8Yv;PAgCZ6d4 literal 0 HcmV?d00001 diff --git a/images/CompactPerformaceAnalysis.png b/images/CompactPerformaceAnalysis.png new file mode 100644 index 0000000000000000000000000000000000000000..f597ed51243fe7150a18b6383cbf2f277b44f730 GIT binary patch literal 13411 zcmb7rc|4Tu+xH+#WGShTrG?5`*+Okl>8T-S9T=XoCI_xK*)kc&YsZ@_!+8XpfppAo-fYn+?ny(nKc9xADA`>+l*gUNza^$T_LhM z)>wNbf3%#u=P~@jY`$uWe`CfVOeAb?Jxo$=@Lt5B6Bkw2=7%giD06E&xx-<5AugSH zG20C)+Yx3NotaaTssA4uHHhhd^5ydq8oM=vA<9CETz%c9PEYw-U4`;Nq3>#+22dX5rTVM@~Y$ad?nv$H(g}Rt8O_2jl{$cH~#e z#gHg9bJ-uF)#}t)mj*_5dx8b+otzkD_NFq!tZd1|kgXQ!9gd})tj;y9&490QLXO)% zK0RyQ30$76LffP(_%+>zx3OQZNwN;@7sWk4Dc>;|Hj!ZR(Y)7?4)Jc_2HhwrbWLix znV6L&I2X?K&+3_nv`dD1NI6v`l#|rQa<0)$41CP~?c;oKQmfpbCt?kW|0>O(9?S#ndRYiM>cpBY?r}`EIq~Di4x1FtZ z?4mi3eehIyrB-n82a4aSN3{*BWsUzTnvKG)IWo0mxn?-rAZU{v)n*OfWVbdqpLGpY zTKVA>xc$>anLrT#r6$iuFczy&YDZO652=MCEN?QutJ3xb7 z@z>U(sH7zO34WYwPV}*riy6D!?3eS|jYlN;EU)b>&b4JnU@U8TC^8iu@w;)`v3tMR zNlEC$UlnJuV_>;>e^y8Flzi;J)}&xud4KG42`!?8!rS7H4<#@nB|MzQV)?Dw1Ci_V z)K%S)(D$De8gB9iYOp3!<|YZ7UVCd^j{L0Gvv;xmr0{P4NJQ%Od;x!b_>d|EjT_1!9$!6sSn2t6 zO$M^BJs`;W6FHz?Bal5JJd#H};z@0TnLb7;IZe6FFHOIb{Zc-=E~VsIWqL`dB?Ggm zXCM0|1=h97EA_>0?`TBRJaWB#d|H!UG8ie{Ip$i&>>m;l$XBR)XeV+(mz~BuI)1yo z(dA{kYA&Vl;qrBr%~vKDJ0#mJJgYvrh0XeZZE0EA`cU2KOU$_*%fMBFzMCEqSY$|- zn2JU%*rl;;&16M)1Pj=OS?!$SY+A#yp)3hVnejcDpvaER_8+*D#G#7!W+v1h4a^UB zhvj-=&u^`j4AZS`sQ;eMM|9%V&FLJjyqTFfa(+yuT#2dEjEf?@HJLR+pZDs6lCfx1 zH&Yk-z;wU+D1(^N88O4U4(pfMf97UyR_UJ1y}sDM9(}Up5rAr|M)ZMSYoaJzt(07} z8m{$ucU0qg$-AL7>OiZ)^eO9f0}{i^b$nrWC8?E&Zt^RB8tzb`<}*9rFkW`j#y;ngIkB(ARl%tSKY&A&iYDGBLAmuQ)N>Jq7;%Gx?lH{A;E{N$Q zubNDN5Y!s|SLou*=&Q`V?J36;eN!`T_N&kEM1St z`IgfyJhX?5T{l!)rO?ghk&~tgppOv*q1hfJS{zya>O}UKOx#WB~OlMc#)Nx zXR0d1bp>NpcHfUobo5%3*r-)teBiixa=BCNnAu~p%%j>y=e_{s^Nf3k-AhV&o@~jU z3}8m@D$o_TgiknHa&mHt-o%sdm_=u-?tNZ&*DP$!+40i|`F+?1E;_Ftqz>TaoCwYd z+A8NOG(F*A!84L}>DFOXmkL!yjPGXM%1&Loq#R*ydzlo?SAtvMPA3gZ;nTa@^JI-! zoh&tW+SvVg`w5F5S#0SsHa&8EI>`*GtoO+C+M;orZk7WftlXD#)yM9J3G1cUTlsXo zyTz5~V)y+zf8gI@SP48oVq<3^%!^0RI<_(g8c6J=GGas&S(z;Rq_`N`Kol+{mN@0t zzw$DBm!o}MEvxYZ@P-bInN;UB?X9q~>e~lp`<%~s4#`%xuFegsF1Un6KNgj;sWLNB zx^zwSAsL~=$g0ufK)}_C2Su8Wm{g~)V*P$Z?f5l7r<5KR?$(oBi=50@$PR%xvFa1# z%CxPB67mNNM6?BIF`R01NhLH)AR&MW+R(sY93|j?|MoF_IrWRfLVvbZgk)SZhX~;f zW-V=gg-6kU?&1z?rsBxzM?AT?+Df2JUvem0sg@$7fRl4ccWOj_ioqju3}qvF5J%hA~zCzW{Q+4hxZ%w z3SH)vGB7hsu3B6SsNJ%Z_nSTHB?9Jmil^a)6|CY`&b17dBn8Wl>Wwp3k*|zd4|8*K zzaXKa@hfeKwN@!b8!H;XVH46*D&b z_+3EdtIPz_4J5C+Ws=sbR@IBDPa^x~0Us-tY@*;id5VuoTFqTqpyEd)R5sgV&z?Ki zD5Wo{idmj16Cs?(4EM5pGAHh=S%8Z?9_=io+4mdEQ_I&F!1lboWD^27;yN=2@`T&o z?oP(WO65YfIO`Gf(xniILL{evfWV>*70yrDjTQ)r z>qPu8vU#wVIc~v6FYU^$r%#`b1NMLDR|?NDTwD&;C6YNPe3A7$2uQN)ihi^9iEzfE zn=@C6CQYU9O~1Eoh|*S5d+2h~lm61kROncY+|Bo0;38{6nFn3F-+RE#T7#Pt#h0JE z4ORO^8usSt@nV~L0b%s~+*@c8D|Z=qFGmcGE<3@DO+eZG^UU$u34HBNk)6<#lggG^ z?YQueP^-_ z@@MK1C65l8R;O3!C47^z|H47~MN;0n>G^&%iWyrl<{KhzKGd;<{BjtkTBm*##JT0x zw|J_@qt}%N-poA6zP9x$9CBh(rsBxtR?4QrF6~o^*jk5f=GUKG8;s4(kwXq`^(iV+ zQsqswJ?Aom@A4E_jOy?fSt$5Ue|H%#KBdE3ZIE%b|9Lsyt+VmhJROu9-#S`?8;a8s zHB`{&4Yt6{zFJ%GB}`TOQ&S3iaVB=SM|oqjJw+6`tnHskwHU+tjsb;vC_8XU^ZP;t z>0Pg76eIrwRjc47EguW^X87mI)e4^}{f$xfFz37(?#B}o6IPvcGTyx|$5^e1aFu_? zyzD*KcS$~I4dcQ%F~Fcp8tepAD@MwaB{x4mbqfTX_nuiyD=Ek~hBcx$Zr+q0V_-Wu zy}l8}#^*ZkRwFthYiif{VznhHEzSLN47b#0F$x-9*s=q%D&q&9ta9cDt2ZW7TX$|$ zHh7cy0|Ntpc4qE&J!nyC<@D{rAM3;?FceyN5xNK0 zQA6C)Zapoy_zsOg1ZgarL5#{sNwn}>S|Ol9Oe_~RaJKIeoj6;(e_<9r{Trfd&A4ZC z`cwlBCymv(uTLsU$|eb zc-7y>Sf7z4=7UDIGUVr&0FpfnkG$xDT_^6mqyYv3WIKd>6h4m!0k8ZLgtY%AhN#xj zf@_ejvj6S>@{wx`P_A=O+QJjC{p&ura`5}JXV2bnbw%4dI8>zo(TK_PmkPVmieY59 zj9ZTwkPlFh0`gYt*;#S%)^2GSFdDszL&^mH9A~D{6*Z2GjGXlM^eoQJ&3*s-`HBn# zc5PwA`jm~3kkHreLX%dis7djSyLaCfxDT7(4bc^PPch>}XT@V#rHZH)qPB`+wVC}O&6YyRjtT&0{2Tub9B)&7>AV?2; zJO72#JL_5^K;Y*32u4Dl;*wlOI9%7|-A_KToNF<0+uOlDijtBB*Pfpkb%kDZUQyA4 zjB`1|E9?I9$B(-{*S5i%Csuo{0}kn!tKKj$SQj}B0LQ4W43L+Hfp(VFzS|Ioxp$b}29qF8y4lyFpqqqM`O+p>=c2zY(t>5|J%1+&)7 z0b)Wk&`|j9@dp`pi1Y&J6x-QcJp;mW_;inV%e)TXcDpPbu{Ekn)6Byo?JO9wxpGO1 z9#YH%ZRFomD+g|l#P}nHw?PQ8WJGY1S*U=+N4)!1PDw=wgzJr08kXz&oZDF@|1e)^SJ+2 zRTaGUYmLL84q(GAQfRtbK83?I)0B+w?le%L`YHB3(~K68kaz((#a&4o`n!HtAz=qQ z&|Ve{y_VxPji{kt^X?Ubran+SaIW1tmG1pNehnU%xbqUk2JLdkR{OcWQtzc7TG$Q1 zgxryj@YcNT1#hoGLtf%53C`cDDXO2DNU}}@h@+japMG*G`I21}fSRup%}K2eek&Jq zLWnb+`MLlC+K#6PLksopkm*CRO;y!UnS<_y^XHjPe1k+=00UC^RgN4hY@2fC&eE<` z^uFD~HdX2K3;qP;>NwC1X#@tc)abCV{FsRyak~A5+zjuo_H&aDn7>okSn1aL%`1ro znw)A~+*6Mdt01b(2GGQ|E9;SSrH$juB$cF&4kI&Sa23cI8{u^wYo;3;8~dwa0n##A zQ{*vQb@fLD_8vpk-S;yfZSTz`yIk#fVeekcfUn1`gGWVdhJYs7@$S}{eFxK1gBIg| zVc)=|1a06l)QftGvXo&$%|_7Aac!(iLo~Y_9~uQz4AF?1WCqxi4=q&sGEoXGFYN)0 z=|%rJ)BgAObM6p>&4!T_an3R^up4nelF_+!>y?-}?nXyP#~nR7m}XDpkHU`o9ab*# zj(GcYG24jna4uaP9i5v(&2PET5ZP9glQS_gGWvbDZnC`aotTVR~T z)X%>;4IX^{2h2SPA`gCJdH9!76YLdDCR)Uy3lb6#ht>v;;yywgk|J8*Ju?OX*En+g z_>`;pMgN82);jRRVe=sLQ@-#W)(3Y@x+KmpnKIb`(s$XW4o2D;CIsmMUZ1anf{V&^ z!eFoBrKzxPQD8`NW&Lb>PW067yIVPDRa8a>zcn|=NsfP^601lm;&TVpH)V1kcu ziVS})so70Bf+HS$o1d2s@|}+kaO_IQ735zhAv9EdgPBdR8sy%8TKHez^3LiZbkAcw zYh>D=5q0#{3J#hm7!qiKM~5?&piMT9|9&`VRmbc12%4+R&`77B{Hv8p7BBx?dbt*$ z_2(|lLKCuG4Fd)fFXYYv&4wunPT)~G_bi46a@x@{6YR=n1dOIR(vQ-k4T>kNR3Y8O z8^FcRN=WFloxD`x^-(89sSrzr&?KrG0;9?L%`1A8|CO@-_2WNT_O zVDS=kjBL`~VsIm))2zI*_E;KK@$0}E3; z{(BZoYUEU`O8nC5@FjWAp$UzXb;>H?dQ)Nxb-$Qn{H*oc| z;pVO_gYNq?{g|Z|F%I&Txl>>XaZ;jiQ9T$?3frHYP z>>3ZO%LbgmJpgA+ofi`l5l6Vje&l~_Fn#RO13^B?=C1bMXotK%%f5D$O^G^ z$u!v?;B3#SI&LY)FG}l!XFy_|Fg|YY^W`F${V*FKPH9UIKmo-J2>VwT>*z9E{yQdr z!NdY8XvpjAMdc~JMjB-lW-^7Vi<;I1EXdZ|ZU0*80Lsamz7n#UtC3OM6cO=`nT4g5 zIyiU-RKxBA(&?%I4m?(ciKFDZiIr7`{Gd~N2}-H6@ITqLVJGCBEXCs1#5>B{iyEXI zz!d^Llai8j@7zf%bg6IC5ul@^(-317a32v5xVcS9j%b$Z=U0FjHz;{IA~W!v#lj$! zz6uxF8f{SRgoV#?L5zQ;?fYPrkDg%}uZ-*K@u{hxN$yK`9|J;f7~j4I&ap_I6N_sS zBeH%9Vy5>&oRseL0ELDsT=&VfUE~Tkp_C+`pcY$22>nK5k)@@jNsQW-P&lm~U3iKJfCkPIX=fTR zHWY0^1d_!Ug;xYDSYa7PVc~uDaO{zMej7WGa3tUgAAvIwNH3=JxCbx<;4gn)+4m|= z>W?hIn)7PtSWU@_n=Fza1nC$K*#LMtlmq#T3{)GQ!7TS!cqIUrPtk2pQw~W1=lRle z$Wyy-Dh_>kuyi`;Ak39%|Apn^7L{+PUYOPvgu$pm@6|lQAv$J$LqFVo^0(X zcQV)jMeROa7{?fd0z6adOK@2x@HJEmH^N#eRn6q3-fT+&xiz5U-|nb0!ZOE*PxpB74TY@2zuD1)A?Cl zk45=bWPvYTgOuB_Br&toAQ_GCFxtI9BX|s?ggY!8CFtaC)&{6-oDnqt3RH4+yuHhE zQK+w=zC4ph1G~G=2i~@H7Y@e=Ry?W*TK6==Ilp=?pkf6)rV*U<^ZQZdd`_?60odOX zXcN_=3>e^4B_=SzC~!93O4|S`8lyx97J93?Lm9h2uSl9g-~@3gDI$BT)_X!AfH_Fg zs6KDoEq-T3bBm`$;?8@)R9TP9PDg~#o_!NV1^MyI4}guY%(204HCYilDC_{n&a;>j ze=aY#qI~jhCZFb0@P*vyXc^ihXUe|x5}+ED^gwjv38IJHJ=85K*hB|3+BI+l0IdjT zdW!_Lbr|e$VijTk#L>bujXH32#@Kx+*9bHFo8dqN;>m&iAP4399DqOU{|i{i{*8}- z!GH*;J_d8$e{aY9i9eSP#3ZMB(DwY)3v@pOy#|W;9lB%FLf-qjuq`kYdEW`@(@=f$ zuW$LsI@yq!`+r8fAWx6_GiV&}!d-fZ1wH#f)-?}?{pg@vmDfrRS;HZ+y+hYbOG%$G)sz&Ds4 z4HHryK74p;ZuGAt|Kxiz-~Y07k99o(R6!1!!hnTg(7`ZSVC2%>)C!QT7@Wy44qzOb zuIK+kSm9q?gJh6wPEvrnoM19#IVSdUWyQD3cP38EqVlFADT4|G3S+K)!zXDG{{EGR z_!NwPjE%LLMUdAMpb!H8XMcqB6jh;Rqk)|k@(HT9CY?-MGK;b9jWbYrowgL9-Ch~c zw*-AH5(_pv3w}ExeM9G`qyEbrYN&fuyhdo_3r65ZhVsAtOU8Sr*z){qA8bpU-F-Ra z$8Nc28JET&PMBrX1rn@CKhbebtYe0a-`cDCytsXjO~}hbU@`uPwT6cZZ2Lo5>J?gT zRF*6BtTI$Be)IO1h8z)H>bJ1LbDQus9iq|U?=Cue`p3MpYX1iDR#oG+Z<;c9W>EL< za#1mmBWZH&W8PmfxN?Z~a`?@4ixTR5Fj;>d=X%yHI^vHAl=|HH`whNNDrOVB6bn1Y z;+&lz)kLuRZ=Ka9Q1AK6O3&r@mCf;^udjtb7ZYS{^8dU^2Dmh5$s|UBJE8fy;y$;( z^l9IVa}NHt|z?o)=A?JKev^;2^sn?bDq-C<~!yEgDPS@^kVe=t|D6yYfa zB84XA=?aPyR%A2iaKEcUqmd33pXm$irxbwr`NDt9F&jY2pLozbhX4Rh#7?qls;reXXKT z#psl?muNuFYOI7GI_s?ATd2ULmqs?@@(7CN1F;2`PrLkhT;pRjS zO*cWryjuIo14CbDCoMXE=*nRyzxXWH_96dxLrd0njX%qh03IH)_S4~gYjR!g;HzuD z7whnG-J!GZQ3?6FLCTjS_-yyx172kL{JWrJJ{XNVL6~|rl!pp zWZ0A{U(fk^E_zL!8#pYmu1-d7=&UJgyJ$`7fgpH+^MFDv9K}gUiR4Q3y&^vzkjI+9 z#*=}&?7&LyY2smmgMCmHcYIc%3>;49EMG{NR31+XGGmoH;R6> z(EPGqx8Mi`-dt zLhAtaxzxD_V^-!pp>sh*Bu?wuF>P~m9$xxWTe1th{h3wmQtp8s1dP?2QSOes;-7~k z52cBbR)W7*H#Ge^`*n`|b)n=`cDZ|1;j>_J)h*4B`VJ8JquqaV#bX4PnPrcBQRcVe z;W{L{s8o&?Vca^Ei#|CQU6>YSm%}OK_JEyq?;=sVB0A{?+uGAxS@l+WKTY6oZRNnx z5xJqG6IW=mM`k1evE}TgOQoMN1NHftm|NL`h$wOGbft#tgzBbmq4zUJ7fa?dMxSxB zn=nc0M|h37h0zfn(}A$y3i8obpPCOJ?)>=iLmim`WQl7-i@s9_db0(~bsJr|Gv))s zDh?IUO@cLiw!==hE2e=|n&IADd2qx0te%`kGym;et;UBL*$$~AS`)iYHa1M5z#9%j z`R=}4UV?iIL_Hv`H2h2aDY*p$YSXEc{~mN&uL0>TJn|lpIh#TDDWt4CoObKo&RtM7 z1}ExI)fj-!pN}*wOdsmF35b6Q|Ydj2G~&k#%m%`nVRe=J8;6JJ6-K*fMgdS|5_4Q@x00^VJlI z{@Spx$iUsHu-!&C#q*ZyAkiu(0x>lS5a;*gDB>9CZ{zhFXmbgn^U$yy0jj?)kihLb zaq6JLkq-uEf8Uy>O0mxqKKN0j1$_R9nmaxWx zd`Y3%Z}aUYXatayB>sAP=l#9%zGGhov=d-=1C;l|NQ#3)aX1G*rH@ zHZ~UmeATOk3e`(2g`3bvoFDuQ!4wC%xD-@b-nrwBrmc6^C}Qwi7#=!+b0$aa_Q53+!&jv<(7?3KF)z%`Zon;Iu$G+{EXZam+NM+azK0w2APp|f!tb5=0s*y?$ zi~dTF)Qz@bP%^AK`EnvJ)iW+X1KG6wqfEA%<#g z3oDQy25MYN>lM$7J(Muk&703I_f%P)8}S(RPpPbM0ps5E(8!m}0SJ=9l48+Zl)yOx z7SgWbV{U?IA!}monwb8;Ju11vdn9B%@&iev2Fc}K*P)Z@$!<9e6uk$0iQda;`V0An z>$>noO9a02(S|h(?nSoeZT$X8d4XF|a%ehAI>mBs^I0j#kutWR+TrV|RhDw$17di} zc#Zg=ey7MY!q6=-cgU2-j}Jga;x4Jn%!ux_lG}nd(E5@cIC=A$^v~F!45psA|2TP~}JA^u}5Fr(#cB`q=rs zN1v3{-k`9l24sARWLb|Cr<4p-u2c7P^o!kD9|gM$JqP}{md69k*oyorN1|2!<;2SD z&g1Qz23Jp!RQ5g`)CLK3JZPxs>$YPA{l1lv8Lv58J)hKc#Mz8ak7YHim8!r{u16OF8F`?q)b4t1?c55B z9*Nx1aD4OBZBx_fW|Gum`IT;-aU`7hl`Q3S z*OgZxv9n*#Fses$r0?ELxnZ}$-%IN^T<{Qs|y zAXq$xD!E4a`g~?(_S(i`r2cXz6evZ`oq%%7(n5nwWvJAZEbk-C3&J4_>|;gf4wAEY zIB<%OTh24hDL62&iW>_r0?o@`M;oFE^G={%1eVDj6jdid#=R?rQuiFPu zl)QZR?s2iT15{gG5Hnmy}vckVVefkPo{Kmi6M?l5SpuCCk@f*NvCcvKx$SiLR zn}Dzo4|oYw!Eyj`%dxu5mjl`|&NH%|TzoDHYPdi_ zNvZXRoY3~$TU*zSpA#8dppGLPP6G{9(S9%Y;WPzXyY}3zBloWYh6_rXPFT&2VNy%W zYtXpy2HguXalP)DdPV{1PTZ?%5iv38>^V?@e$fn2MaHbgzo_7?S@eO5`VgSg^ah)! zr${-y=)IzZbLLaVF(?@ygtolf5imL~+NW$i*&nV8gcuJ8FYA%EiR-IkwS=H`4>}>=+=9BvjT?d1aFtpSjLOY@)6@mZ zT4&FkX)rGB`n5Esq(%i2ooqBiMaP62^j9MvStQ>yQciv5S3*AE>4!X2H6ds^J9@kb zKa9S(TiVDU5At;%ivhcScIw8kH6R5bK72e$+a;2~Usg_=SlQXp5Dm)fYc+>>K0<$0 zAaaZ61;*bL;INd_IW%YlNS!J*>6?ocxM&leTs(S{U4|AKB_=nf6ZET7`^4!WMMm%d z+0csj!_?Fi==%82GYI(0|J2C$@5lLHZvNj-w@lL>?Q@PX7Ur+sO5jy6Z7qGxVs)EG F{|ETbE;axF literal 0 HcmV?d00001 diff --git a/images/ScanPerformaceAnalysis.png b/images/ScanPerformaceAnalysis.png new file mode 100644 index 0000000000000000000000000000000000000000..58503acd8a86141cc2d44c4139810e74226d3666 GIT binary patch literal 22870 zcmaI8bzGF&yFQEsq96zsEz(Fx!;k`sbP5iwbaxM-QcBm*AUQPB(gI2iA>AO7L&wnd zuED*}IiKJ8p7)Q9I{RU*=ULCX@B6y0`&#o!NkIz#7U?Z4EG&HKSCT4NSl5)Xu&y}V z#0HebB^k~OpV(OcngxC3QYOQB4d&8l) zW~C+51sI90@pz~>(|W3=)S&GS!im>rn#E~lkfTwn@qDki8=_TZ?Zi^7Q(Nh_t|)pI z?fnh&F4Dfh4rc=>ueJ7i?Q2va`?-$E+3}t)qqtgwO>KEOw^@JYr-_3>pJH!Y+XE8qdiM(GQvJd7?xBEA zoofkH%G(9q;57_Zoch*{Oh;7z;up)VI1c){$yyi1^PSknSM~0DU2|6B<#rIl!M5}B z%?sVu5Hg0P-qct@myPx!o!a?>#*2Drw)YE{g{37ExYPRh5X=*m6)Qx!w&J?|^I;k6 z84T4=cU0SWB6!q$ajNLH_4_$IMKSP+jE>+DYCqhn&~m>h0Pf$d0_g?d$9oXuQJ!G?>qtD8mjF^%RpZ1J7D6Q|(y~e}0((QsYssjrtgH~mKu_N( zdOjzbK*S)-ILMjpeT?wt4679!9V!(&%PZ4>3Ow>Um~!dfIX^owInp23%)Vm(>&x?< zAidLO@|=CtfRy!2L-Jc;*6M!yd~AGDez=*mq2I0h^9-U#p9JEj8$4?VJcQiQuZ+9m zsJ+3)=!P60QwTaoBV*YG3lI9FsH9_0UOlc~z2e;HbvEzPc#=Hk?M_&f z#>>OQGw`g$YJzvyb4BE*Ez%bXLb> zeb2o2*T(QRx3}Avx_UEKd8{W#&4forhS&3F0^ozmf-XH4aR=NED3r+%7PyAF zYITK*4(-AuA~L-lcYZ~DWMFgZyf!W=Y3+@asP}OkyhRi3kQ>JO=BtnQ_si}1c)1{) zGTbaz8KIEeb;ZHmy=KDc0*w%jciO`m(Riv}@T?crml04sQ)Q#);JwfpV@;sTqm2@P zLNcPm!+$#j_KT$>`7B4n*9nKRWMlDgx=GcwN6L)J68LPaDKA#?i#emsQ4omW+js$o z?1_)|tQV!rax)hv-WTjGtKYmfN!cdrhgXXmQ_R3n7T%#=GskCIup_9D6^`Pj*#$|# zNvh|5e4A$m7Yk{RwztTc+^ED-M-&5BU1jJU3dlE4EBtWMH|26edtulD_{zOPyTN1L z4KQ_9kNq`{RkN>ZH57a2nbJ{RROD!vri4%Z4!JC8X=#fbXPzKw`ZC`0*=)Sf%~LO2 zar}}~_Tn1Oo#@*S+4u`B&ax&|+wG-Hu+ah#gD?B*6DAvtTg|sAB|>{_-;}8|_~#%} z+&eA4@6t)~SPXYXGn(fP6+p|n3?K&cHPhkL5^@fBIvUVf*=tEX##UpcbP`ONuX%Hq z$|oDWXOaZ751jUY^ryeLEhRALGUI9D@1$>aFGdX``CvP&7*fz1(bN2fTk=gR zG5+MZPtz@}qT+hHxtGL`og^hC=hsC`MSL00;?ED;L=*Og3!rnZ1;*V8BpopoKq6qj4=Edd1GO`iGch2+nr+e9yRMFpL%lo@lNe~*v&WOF^e*rvLeCf z?<6ai$x2IeaCwd=Ng(GPDTJ$jzUyunMfMe3Gd?-6cA^^GHN?hyq<{58{q?SiUQ>ej z8Q5-!n;gMn*-n}3_??vL0bO2`Goj-$_i;W_?7fDRM3X*jhT~BiuiZqvnS5b=*JH{8 zlBcYy=mn3S6*)C^@n~z8DVLg_{Njcc*VDmV)tvy*Q|cCv6@z_jc8KhPrul75VPq78 zlzqWj=oLOl5>U!l&hR&g_owpgQeAb>*sR+P9l54*B0y1ulj6Qpk4+;yBh|atvG#os z@@iNtZi>*z=GFG$_8eYs$wqCZ>4<~nqGGG$sJYeNKW2ii?us7EmD8#-Z37J@l^<(2Uw^Fo85^&B z(yL#%iFEGFCv=)`zspr?c7yT6q~|z!fJg*i#(hIeOw7lv%WFDbq|GT6R9NwQI}xu{ zGM8dUq2BAm*A#Lj@dXRnjerEGAggh;;e(y}|=;>);{8OaGp!-4f$vFr@!+k&a zOHHl^@f4#I2W1UmBH8i}O8>s7;j-TJWw|(!YESl+_F6CJsfr&i zp6jzM0m%L=*={S=c;tY$q3{|zYvW)x-SB2Owv(Vu2;Ev^0BMo7$u4*z>RU&zYR6Dl zpnEGr-5vW7i!TiOs{+c$-{8qmbRB&@H}kV+zph`wXLWBCQaj-TB3%c>|V+Fdsi+(yGG3)D)d zXfv126H=Q)m!V$kp;(NNe}!&H!C7!uh^{|LACmhqGcyA#cE4*{uUDj9jaOXw{za9; zSgBF>S)CAj=hS|+^LoPRf%ip~8|)a3vQ9OLIW+s;FZym&D@n`hR}B1Yj0_-B%)5=J z9Myc^>a=_W8J}AttY?k}XVxJotoVPxjZRti6Rrnai$uJoOpdq!M&y zkrkCymx!(n_1Pa|8W|Z_f7?C!EhRQ%O&Yn`>LX1 zqtpl~#8%}}R{mlAX>r}ZbzKQfPL@pat1)`@N)hGcD&8wYLqmV3J@(^#jbeC>(`o{n zou+YS@>026_jI>n@$F0;_x;v;&cl8lh4E}QIEz6KSUb_*r!?E9``8=Bqx<*A#ut=& z_xe6z>~cmh3xmXfBUAn4!%ll7He&V^PHdeogY;nv1Wny7*zT^o5(8qM^)w~AC{1HGXlQpM7s z!rW9AtAr+FIaNL>b8wQGXZLzQO7mr_-zR6O5M`nxd;Jo#wclMOr%FD;qR!(VudvYY zUv&l_i{A?efUT*5g8QcO&re>JPNL>qcQLsAKLF-S{Qp?;|AyRP&-_Cc{);C3-xuE! z*VIgKTpjL4mH7HzkB2zrm0L}ypPrr?j+dL+?=H%hu`n@7CvclZzQv6DSn|FH26o%c zwHn=gUfg(IV>w>l!7{RaezJ*N?#t*djbc!g5fj5ABe=XeM(86B0UT6WLr7HS^E@LY zd~v_hYQnH1lFm{HT;W6bdWT%teGs;HFP&1<%d+zT1GNVbihLO4BKCxaX3P~cFLmDh z&b@Sc#T3h)PyKhv$Sf$y1ne`Ju2{}A)H^Q!YVu_kF1{o@)OkMw826U@XrSe$5$M&C z-pP@za)9Bhy)PPY6(LV~Y^Lih8(kb7xj#1PKYN6S-GT(~?FEZuE(RP2u{SI`$NT3x zuaB491&j83{`6?KAK;-}?dno)(_dmDr&|(;g)EsEd(zi=Y8?1o1x3GxAseplNrZml z#OZ)A#H8WZyfDeoEH`a>2}qnz%iiip7KTU|As-B6u64z8Eyi)s(*7_Ylu=NC|N5mw z!fAMu`ZM8cw0Lx5W8>OP<4hN4R(5tLs2WB9AsWZS_b7y1f3bjKo8lV~AlsWFJQfI+ z#P-)}Fq>}OD5_+xElf6%*QySiTDYM=Etlx;XoDE2ksxI%@rg~R=9Q9CxDf0RjAFm= zIhaClm}Nlk>iRe$46`%fOR3(|(Yw|C8@}GIQh-4KU<#6Y@{f; zD}hILW3t)+K2`6b0d_TumzS6EQ7{P?Efdoxd<>^2O#dv3FG&gqk@pAf3Md9GC#Q;v zsw$m;0Q3Q?Caq2V9vu3BoZNE5;!s9fIs)(!@Qw+|FgzawsrSEOcT;(z8QufQecR&S z0a!5wvPid{8IUbw1Mo!Or6nbOqnT7+%ET}my{xuhmN}tP_=1QD844BWR?A{O=cjxP^}&LoMMy&o5bF+SiQwR`J!JMqKS~3^Z`O z+5wSaSPT|^2Lx~%XMN%2_I!~1zJEYM8?~l1kS$;DpWtz@A&rSZ@^nlz#&en60bqeD zorsv&s6SKM{^H!zp%H8^1?rXUxicPRnQu;OtyReeh7I@k#r7odnylVES zuhr=N`}bhZm0xKR^lRTdLVE%|Pq*9X2dzfGJ~toCiFjL|tBOcyHJ52z@#L`@=l&EF z6oo@T`RdJ^xUM9DC=f?PM|oaksAuUOEZee3V!X?kPA3_%ZTm# zxLEVl9N2j91Rje{09Ly#)vSFPv#3|VFX^Uc5ML{ri5+kBZj3i8)is%E@LY5TI)uEB z3S{=z*S8YlYSIMCPK>hR)+TK8F<6TQp)t{6gBP`L!fRj&6q zfXBr>LbRb9hJ?ps3HdzsEmcEJyz3m6da`%SH>d0EHzozbW{VV4MY5&D!YGBw!-jHI z+xoa3H(p9wLUZ-yy1GeA-7*F8K^vD>qtcpsc zz}X9}5(~d!u(K~yD`S%QzBchiHG)orIyvM$ksn3^^kEtJJ!m_Mn>$Ozs>~2M`uHy* zm|n(<#?FZd>Mh~;3m)olMWgkvM)ye4A%qM+{!M?b>1m%hMgHC@)eRN_ldEct0(OzM zPoA?`*SEjCaw#3!N|~e-aEe&5l;n=nL(hNMf4cqIVD9e#qhrX!@(MRc{c>C-YVVs7 zXj;vrkC(zEKQ3O)vRW(Rii{LYO^6KVGCh9ZFx4)d;B1g+xl>~I_qNu`gL`dY!MMuE z<~gRhYnRWtWvno2MEpfJ#rZR5!xNS#PsBT;nP@G@PT7_4)es z>+K*klZWlv2Iem3h-OqLNe%sm%#Gf@Ka2G`P5j30P$>{$Fom03!2W-gTMUZQQ)<-^ zTE1GI>s)YQ~+vw@cw zc}QAXRF^`#k=LxBwgn8=&V;W3W+zCw+5;-G{HvYH6w09CzqOP(34IM>J^2lfnvtLX z4M^aISJh=?J{P)Eq#kO7#Jun8zx97ut9{^X(EfV*>6ApopJoJ`XoHenr>1 zTKLJqX5v<6Bl+%PH*()YN=k|gpt+Atj7=F9zr&WEhiKZWHJ|t;V%<_ve~)EPmr{7In9u)l2(^Zh^8g;oIkIh+B_k{5Md4=(avAT^GbXGPr33`#VBP7jEqu(y zFn8==$xM$TkNMzbCi`OB$0^dd%|S28s^w-Ko$YN0$^E^ac%iPDq{0*<`-w! zu+Pf~wwW1sz&w2^MLLK*hJ%eMB>D)`q@V+uDEnnnya!PynVB4Vh#7K$QBpph^F?}k zy4>n65VT)`P?{{_v->0YDI|nv)z8=W6$pFSEoI1n)7i1p%opgf{FzOkO;WLlYH^FbP;ajq82~Y zL~md5%^{K!{L+=krwD?H0i&q9OuMzAr#1dbtci`+|H-RwBSpa$y_g)J7ppHhWWnsP zJVR);eHIJ|``PlD{@EBPa!EFS|JKLwyv(euc-<4AV26MC@|c$t3yV&>J;j1v3?+B( z^0$AW#w+p%Jn>aDwetQ=mn^j)UDD`Z!Sadh&~2^SJsY^Z$w(>Wp<{MO`hq*iIb2m0 z1D|@tppL?gB7mmwAEv{xwt zT8J{)ImFVKQ5nImlQQl}Vq&jHM*_U81Nfy8sWfA9L}>tH{fF{;J#Tf;BKjwRrO$m& zGtCv9Zwp#sOQ@SsZasi5n^>l5lY@1*331s-!c~<)9<610xf_{w9${fI#$Saeaa}S0 zjfn;@+35O2mEtsPSKh(naBDU@-j|MpL-}R!qo$WvvA+10#BRShruq_0@K`d&((`0L zCNS_O7ZQzux8e5yv4)$~?gG%5u=+9!fk1+{N=HlU8-k4`h8vl*^Q$k!X1z|;5t9h` zJ@yOmD1}CoRsrK;KUyp6B}ZN+9|tS9q9tjRpG(CJ2)H|eMPHq4%yb~%e}5=77b)-4 z|D1$slIe9`blY3jUJE|!Nss^^=)t!A0LA`3sS98gK-WWE?WP@MAXV5Qj9^}K268@{ zyj-oZY=vAG~UPvRd`e+*}rrxa}7@?-OUcA8d5n+ywLQ z7jQOcF#O=}JbX9(>21rGUw%2Kt_0%IVStpFBrus8`0-7kX#ZsVrhMwfpej(v;j9Zt z_D07719mD}$@^blhLCmue3)rAkeyww2ns{l8-<4;{f#;$J2Oj==ipMR8_3&|ph+I5@?AfW05Ks*z05#5BTB2>_E|f@&^EH2ECWg5`&;eH(8WXRfq+s zmiyk2#zMu2ZWQ2&31aZE6QcB6xPWH`^^S{TTF*bFCl}PDNc7%BB?Cxv7eX9EMN>=N zIpA4e7#bKS~@#ZDcd^oQKPxa_3>V(HO3B3f$~| zT7lbK{a}r(w zjKZ7Jy@~zy^{4@aAgj%AK|BoIC$$4uD&CShYJGL2m^l5`M<6-0170M0TU^Xc)+5w_c5-~uapIP&R zZ9ZQtT@IgQF;Q8VB;uJM40i43tbbBR&SVAX={P8;*;&Lc=YcFk;$IoA1`3XK>@_T( zbXI|dw2a`*k4w%B_QIZPOIt; zTE#*42?@U=9%1>sqt^%CgD)=+Ad;9`->Cz(65OWoOdXS;*uG8Fy4WD{X&&+c`GnJH zMGYZVVc0<=@)N8L4TzA}m+8iE=IrDEQ)2DsTJhTqZa-iNZ*PCadk1VHU-NEipT z`RjUH%o^RZ7neOOH|(>XZ;#-#p5!u7?rNFo=P%K zi}pu=dJzX@#{Vw?Pw`()AAFURdbW>VY-rs{a{6LShB)2%#@f4yd!<)^f)Yq071|C8 z1(%LuiyDxTM)kp+bAUTv=(R=!6I3e{7w3(r75;B+Z~VJFS=?A+C29^xdy|@#K(}NO z4j_zaD=(3h-rA_$OrR%%gj8bqw^zt{m9haYcbv=6J(P9dJmU)L6@Snu`&jU z4lTQ1#d@N8YaNZi4|0#%E>@Mo2$t_vl3)#McLLH#)X$)Y zg2K+Hhjt*{gp4R_`x- zhlL=^ych=RB4B*0UsE|7-myTQX{I9<&$cpQM>~-zDI0qpj`-rBEpsVu>Feu2( zm&2>I!)ll00G0vFNv>lC6m-L#`3{9u)WH|zlP6E)!hucUJFq+$jedQb-3kP;FFBy- z6=nt(zZvjMr20Q3N`)aM0bFGni9yh$;u0zfVJjG8z_)Qfho2n*xF%Bnkh&Ac^BBz` zSOv&vKd5-BOSwj#`yqHBXT1TuD9GG8Be*z@i%a?FnUERCR2VG*#ex6YJvXVqHwr*B zlbZ&)xw+X=?}6BWJSs0xM4qgLt!~$eWTW#vD?9d1N;f7XsHmCAFcKX|nQ#n18dMoB z(n*|A1GD7#tyRAyq1p*a;oBU4^KPU_ryoEt7QdY7dJkE-;9bky=@ELH246-VAcf%G zx#R4v4Jdp(NDuUdakjfv4!gRQS_H*VAN@C@~_>v{}rj+T6^ z2Q%GDx=e*>!36F80B||5<~{$2Eg6%&UkpMgJmt1n`6B2+so4O1wZ}oxWQ~*TwCfx` z3CFwZ)SVij+C~D%GRP#Em6at8XvE)9oVeM>uIK1FIfF+u{OANrjh~-4KV`84Acqu^q@^4i|rZ2K9flGmr`-0dgxY=XmjNz}6KuYk~QN>u5F< zaWK3DrQzbUnT`f2fGEB653rWNuUx}^1r*3EkV3LW`p3t9{shI=2(+8**T;E?b1JQ; zm_Z0|AwPWlXw;Yf!WdgYmQ17I&D*OF!K>s4bZNp{6Y$X;*R>wDQ9N zdHP8%x>7O^ChMYFj`7l|vQDl(14?Y7su}v5Y9X|&RY(yiNbEfd)n>-WC>1pwCQ;k zNLoHWbQH3{>Tyb<)*|}HQbCCjfb#%=%yBKV>=mZOREZwV6HBQE0KnVchj4 zrdMc0YHmJUU>3?=aXZDILU?~z!4-rE7b50OZ&mD!nUwI!iHO(f=-j~qs@@Y<`FMBP zj7_ufHQ{GR7ZG8;SH%yrq@jrI^6XM@J3m(>il?vvDP$FPM~BDix=T;Hud=oURQ}r* zsBzb#0&#a_19h#7_jBKs?mSdF6pvPC$pvLC7j%&f;(N~Dls$)#q&0N_!TKSz(lWlf zJp72}`ACrhmr5S!C1IKbVVkQVRR{a~lK{zf07+a}ftxVB?$=j+o;Qp=pzZR`T-J$M zt@l&EyK=Hy*;Tl(q*?z+W!Tt#3X1N_&LIt%fyF!rg0mq;REtR}J@l*9=UK))YMEZq z{xLe?MzfW{+)yK8dO2GNI}jU`LydwSvh&vCpif~HwzJ|~5ot}GoPd50xQd(jm(M1u z%N&uW*VR`cmMHmczjJE^g@v*IveGn^7<{M)d%MELBmdBZ^mBo45&BOv#yM8CCy}RG z@Zjtz)&dL*YhDyUNfH2XGpWsibxS2zg>L)czy)&lqILo$pn$Qf$&aHO=;-NNg${r- zhVvkdLeO}_!nAZ*K>ax3cpR_;IM}X-Yu+?%gRf2CD9VUk!L-3t_h5HtC1X8hcNOej^zYBp;(K&7!}HlFz?fWq26Fo2!2f$l<% zgzBcUZ_X2aFzaGc4&Vu853gW}HI#K8W}mS|bBchzGq)-7C-dvPxbq$N2v$~$0c(nV z;(kyQcCKFL`u2YK^o-X@;o5iwviuBSxvGKtaex)~)S(6RZNz2YE~imkK4{ zexwdmH9Br?HGe#k$T`9xU~L18gb6*HGwPoMh^ma7TyTMQb-&|+2L{}NX)qgo_ibQU z*b*Oj&GWZLI2*!6eU(}Ro=BN|aolN&>A8clE3Z;=07WMkl!Z`khxXRngrzH$w+t$ne+t!2JO&>Ch zW!G~)Ng`%Yz{p#f?|y#Hp4bQQf~#Nz)Rv?D37hV9kO1OOWK6oS?~t1^4?9LKfI8!w z^CwarY?(eH7l9Z~k@?Txmyw)2$1*8lNt6<`MTobCKR@3mdh{r6jxY;ji)s$7U0Mzv zZKLwG6rTbmA`@UDlHo+)r?`%TlQ?nwdNUs^gS57Gl920`SaWl;AxOKvjDk~JV9On) zt(L3)(W6NM6P@~a9L>T=8veITzd1O{%0Mk+76^eIejQl?-a^rqgDi}}**KwKWUR_2 zwYo6Il=O?5xcCQdVAI6tis3VGzc$&jJE`$lFG#5DWlYj{3=T=tg;;luTJzlZRwCi> zMf4kU^E|bD^=`z$FMAQz)qx$6letfyK9zzjMI4&qu^~`36WgO=X7(27(o9DK2d~G? zeo|@-DdUw*h(8vU3Z7xu<}a>^|B6MktyndihAuHQ=YrS4D)z*LNy{ zK$a9F9GB<^{TKO7FYr_zK>P-P>*wd8Ka-T1=t_$T2R-pYAic@|767tTJjv&fkQk8X zJAhjmc1Ky0GKaNyJf7}|Hmz$%8SrjPe#K4c}O+z z)=reE1J+Svbu&-_DS@H$+KOj#VX3JT9GE-IeoogoSqxMs65cQXl!$(NLShi0#Yu(k zt5`k-L5iQxQ_#^U@3f2*dAMAG)Q> zjKtitc16Y{=MJTisqvTcpN6ik-_UDgQ6bl{e8SQdX!}A4fLclGm-AGksC~4=;Ad}d zulA7}c)g3DM1%vV{@0>Z>VugEgUhKXLS(@!z3e7@f2i5I#3$m<*^u-(j0o3;mS`x- z>Iz1Z7PI5M1i{sJQ{bX+TR-|WMp5xTnaCo13e-eje}9>bmm!&e8!nEg!!t8!QU3#E zRs*ksVz0viC8!3mfaS`99Y;q^O+9e%Yh(y{lmi+-yQCOJ=zv|fjxmIckGCr7fA=A@8smNOSn>uowhZX*0UK_dm|-6hb^XST zcq1}i%W$xPPtZaxn;f7QM(nCytjlW98r*>vMy~=VMc4YljJF=MTFzr2&2t`%6hIA9 z5ZUsH^mcZ3K=QhVd;4}2$b*o28(Z72s-~AFnR%kl!%`HQG(vtz&epwriy@MK=SKB1 zpL!49k)8I+K3u3Ks?(Wyp1PBg6Wx#nNJnTqb@FVkR#nmET0lH99T!2vPBXrtjUg5G zss*~N|;*-7^+qK`}^lnh*aL^vaOJilGKnNS*+4%l%0X%aY>ds(+McH8ri)(B?ZqWY_te3q#}su6D%)ifko z`I-Y?8892}C92m2ZTux#u$qZ@he3}>E?Y6;B&o%-Ol~l`>Ot#oH2gZ2#EhS)VB#$|4Fz-V zyI?|4$?~XISe_(RXXiykON+c~zk)~6thC_z-4rp<9R>)8h#90oj3~T-PgfhP*rKXd z+*rXxrLM?UiD!FD(Q*75aGrspBl8ubw@$2wX9Tm)N{V+(5>{?yChrqri?Bb`3UOX3 zcRBby0@DY2AmCi6H6vKxdpcWxx>;XU*P7`nI;+kr;%d0Ze9x$x$^!xHD8w|IKyQeg zLLPn95U>>9u&Tcu__sB&;I<>H;oC8c+saR7L(Ye@>!buB>!8~dxwJQ;7Yq1U{P#?W zlMUei7-zYC=MEh3<@kIdiVr2 zO909sft`zdb=k@j59seNk%thc!}7z}?Xl)4Gu7#Lk`3idjd3u=?6nKc6zzuI>kJmB zn{`(GHs|JU(Xb!_YT$%V;-*CjC}DW@6V_s*=%ch{GuK(=w{Icjm zVAqjbIIhy{S#!`-uM3BLUM9}jaGzN(Wy+u9?ucTM(UjHA>N(r;;PJhzXq}P?CjK+n ztTjqtAAbzpGhx5>{ha6k3OYwGegPd|{^vtI?y8C;McG3Z5Ppax0xYFJ-UST=EPU0fGpe7eGhtvPYc_isuPOIz8C7aLtQG0L#wFtTLg}xGtyTM; zhSY!Xu-Q8LR}R|+1g)nl)ne>w!0S2**!MRx`nS?@a?y8bUVe=Ly*+>U_8x3Of17!( ztQkb3T;k4Qk*>JKUNWy2GWX9MBc(4Nn2Ad}s zxl`*}nPGrPnRcS0>1{`_ov(V3=-D3ay*R*;ZeX-`{YE3C<8CIFQo&U9VW9X!DE|$F zJ=aHh+s~-e0*s^0ZR?CqME_?X!Y5;Cz~a*AqVJvOTo+DaOmFumAXO zGWlGyu*!CSic=(Z%dzC=S8W?r>l+ABe{$rnNZ9gsYI=UpiK1I&3P(S@6tr~j=(R*x zT*Gw<11qcvBnc)^E?n{)F;*Uc9q!(95vVYQZ__0`o9)m+)E`D5?I+C?ix<$=T?Z;w z1JX;s(Hm0z4x{{IeJdma8cQ>APRv-w^vqQL#h~}asrNw%V{WUFcq(ku2;>+{()4LI z*H3MEksr)1B12mjk=c7(jE9Ne1zI7uTMCVDDrX#r%RUg7DbOV%T8Y#rb_;y}>a}vg z0|xGmz{~h|ly==#7t`k07f*y1oD=O8Sv$N-{&1n58KtnS_xE3CFizGdkRAVjz-|_> zx?xPT1JO@-dDS6KE3`nIz3j8z;{{C)f=<8t$6L`Q@<;bn?PHR+=GQe9*xBCLKvtnm zpP&q`w^a6n)>o`Ne0F#wb3xBo7K5Gsm0x8i_h~UX{|UAJ?}%odRu}h>SM5|&%0ponvqq<`c+*NoMNGTut+Gwr7tX8xxI4pmJP`a!7NJomnw1jvQVs)3z{UN;p9;IF#~Q;H57 z6BoATbLre-z8dW8ABWA`Hu8r4&YY`#?-hBfe3R)0Q6dL(QPG?mf%2zJAMkA{vnzbRfWA zZ6RIMgxbsD7bv%i(%#r6FH^V#q>opDHv9mF$@fN8^3_AoULO84&gDRKFytv20Y$9E z01nqbPh)S~ADD4Nqcs+$JilctouVyUa6D+%t14^%l;7%8{gVzsiFl{0 zm!-MdVDtTIC}%?sXl?Qc2_JuR@#kHLt+{@D4}?v1t%X3vr^A;Y{uCiAR5`jJ)HBc# ze|tU__io1eje#%EKPFk9xUVznjqEOhrpM)OvRrtiMKbfbe-#X8WU6|;gO(FqcdLP` zwrmy4+t#>oML-$uxdwIy>%#betr(cPldYE!q&()A@1u0CyQFQW7_*}(E!UsoUQ2bk zrMTTvSH&~}pWpbPIuydNkQ}!ue+@So`feEa1)b#BNJj`XtL|6?SGhg%zZ`Qw5&(YO zC_GG`awRB(Zx?~V47R($R-N8A6;sBeJ!;@l&LBFX`k{!!=Y`3}$!v||KH%2I2^R_n zd(S5)s%(%xSFUA>``z046U2^@6($Z|0oy#Fbus`PW_e$n+b{J{Aof7k%~H;I8A`%F zve`6ofZC|`W&GxJu%QFepSHh`BWN%KJG1_^D_Eazu+EX4bU71$QCA;}HkoCzLdn`! zNC_?}cyNh2PGnD-OK-68;|n_WU4NRzCCfLR9R79~HD3Na(hK6B9CVdHIX;B$z}CFNWYrw&z>o%sHk*RYmoYv6735K#}fMC{kL@w6c2iU6W?NP z%kM~4Apxu65b>1m=$-vW!{JXWdZDq2Xg%}^hfsAT(}5!^Lw;Gx{Uo`qUCvcB5aRsZL6=aF}_igr@eu~8r^L(TNPU8bx(cd7GxdSOfk#)+4Bi0`WhJ*MhRykLF zBT3U87ngvgI$op^w7?X~bjDG=0|${qgfT;@LEyQl%|Tl?yQ>(q-Z2oDI1fDul$y_^ zK$wJPg;ejjhba_v5~i>@B`{X;o)*eiBP;}wUKp4cn@SPs+!XfY7|F&*X955ItUTz;u>%{Qm( zs=74}o46VWkYUM;E`mTl^_xmu;5*~L{aS>~2v)f@KbU zTShE!Wu!Qj9Gooa5;OBO|A)V0?wkm&c(|Qj6XaE@Qq*ev)_EylE4uRS=O2wbc@Bv? z-*4;;@oiHUFJ!WI=zNT-IYB8HUk)e}So6=>5;;UAz~}#TkH37abv#!XMjl48%XIFX zE6k$aA(icCyXi{bd29Q_=Zi+{O9!BcA2DU~6V9)bo1=OlS(GQi6xT0v`>j}L4DX3< z>w$!;jPA4VxdYRF>KR5O=z}cX_E>AJNU5JQ&L=96RYxaF7sPM0RV}mqQ%hWswK0kH zi|z66kSLLuZQ&YHhALxx`&){L@z5wf6PpMmC*;RPJN&3f7aT7_CK zDeS0Hdc2-|8o(BG8V|DOl7#FRUHmo66apM)@<*p3RfemXM?1>j<;EsFKf})Z4G~-1#)mQ_WqfP{zcXi- z)kjtSU6A27bWZf-XN19yU0%GWZO{zSLZtn(@vT)CBb}cIU8C9_X=^rT+2m#vjEG~o zh;LV-7Ip$Th-NIlkZ8KQvGnOF_4r>#E}k+|7t z1)OxQ8hL6=VJ(^|Bmde|t7UunN@Z4f5N~udvtzDe|vN|jY+nVW$naY}W4i+{t z?)-E-*L^4()n}jQ=96Sb8lJH=XXWdhyv{wn$)~s&rb#RNvXMcHzKg?PwZ>}aXt*%m znn1BkL8GniiHqo2AOT+ZN3jK@L3JUJj4|c=cVglU0@B=<@I1gseq_D0@Q4ViffD8t z_m1T_UFO%M`DF&vv3b|vM`gA4Y;Etgf<)~+%9M*i`<-hZDxG z7FTC)8z=kxAxYnv6UGF^J>HIo6idg;-@ekSHISSeR8KtgdT&f_&fmwe8$~s{WP7k} zyyjt4y0)_KU$B=t$8q{bqIPEj!1rs{+ZVDrcej39v4@g z9+asFOK)=+d??vMbTHUPkMdOF5gB{iOq_M=@FxeO)l3(ol^rWi{U&Fwgu441yr-^Y z`}%`iD+M%EMR?#}O&eEWjRNqj%x(d#`0dY^pfAcMnaEb2SGlXMK?tnvFa=_U9UAh@JV)MY{ zrU*o3SfPW^tLXT^;re3=4ak0H*_K1;zTu~$yQZxrDz4c=F>j&k;KpXNVM#haA znn!e5qAY5)sx3<6+aDD^LK&hZ&&g8SVz3;SZE5+($Td%$SP%D9 z%(q`OGy(;fkZrbqHiw1sm3`bh;_%> zxou=CV{SLkrhP-jMj(XsLO_rA-2J_;x~pO{a|6TE)bn3`%^odU9E-;dc z^9lxED4=+-w+|?sr>2I45}d`;6r9DuISC-? zIi3J#>QvO%ug`Kpj1>7YCcBjyr-ObP#<~jdPVjItn`ZedAce{$FMq!ORIZy*$@E&= zQUELw-%g`*uMb;VyR_f!kW}Wh?Sh{Zmo9c3pS?0|US{bz(Tu|v18tPZD>i4?l$~{` zf#V}KJ5Tz~T#=}G*9kF!eoZn4L-K$``3TPn@0%R2XJWp}Y#KdUO?B^Onp}RfM<81Y z!>6&C>-N6*em8Wna%|UtpLE|3_vKTOz^HCJKBI}BiJ|&~Pccz+#=QIr{;`i;oiY+T zO*C{P@lMAN3Y_knUfx-2uHMxoF9asvNSFjnhRlbFce%-JVEaVgO?Q0osu%j7-JCp^ zZcdY@lJarSnovnVS$%moJ)8@m*r?s5Ucn7;*6KBAhjjaQ5L~!S``2#&&JQblK=@=t zX1tY2fu^OJ3Q977E&C@^qif5Emu6rr^eS@`^V(d$>@~Shny2|^NzI>X?W|teNGDB7 z5zQq0=4QJSPiuF^0Ks2B+#e!CF3h6|X!9mcO3ThT!)P*BWE}9RUFMQ!{O8T}JJ!?6 zBvieLpV)|0Z`eF<=bbM4CVe&WWxEH(J~+?TM8-ge{>Zg6-C9<<`Yw{@QGLV7Q@4!( zrHna(=K*Cr7U;FQXIu%rx-=3vcQ_38^ONZ^b!fP5@&S}0TH36P`N|in-E|}|Us~7f zn7N2NwS@ZLpvLJdfjuno@*45>6U~dIx%C13zF6J!jhgqbTCf$ztDH z-k!Vi75Vttyxs5RX~BW-vu4ZHe4mJG1PEf4{#etpr_!2{(O>XC5eYA^Po$XZO427* zFR)t+xQ1enwTkJTnXxKbB|aWE2@OiAA=`|Z_|5TSxm_kF4t+vE`$f(+C2ZJh!i(Ih zQYOELENc&$I8Wy?7lYrQtrzD={#6%3(8*tehaY7p~vf}_H$4_ouV+t|qFMnwNEFq5Lci{tLyreM|?E1cO#H{EU z9ZIC?qd~cQRX(@pX3P_Tgzn9Vx$V>D6cvMYi(v6*E7(pkOBxM^FIThhGp8cQ`>ow{ zQ1x|@B-di$@Vl~JbC0fAAZsZyMl_;0v@B{V=A8LVo;`w+@7^Jp|9%yGjSPjRhdDSK zDV%kwk=6s=938j4(g{ktbu{KsdGH^hah$VqNeT(jL3^SAynep_Ums^4*3_AW@jA5+ zmZ1udEIJ?v7B@f?LCTWQ3Jws8v7&;qL@l6zfD$M~HpPkxEu%niB@r+zvdU^85NItR zTTl#JASlWbLWq!%7($XcH-XQqF0pL@AC-~GPtoO{mieHjRGg24KS*57{pf0rLm zygxjGR0mCc>t6`FuvqL(05UKxE~|7wNB{z%mJN*-)wPxYRn|~nmifhanm2=s$zM2s zDbrfyZkS3a6S;zFoC4VM0$`tRU>-8k6wfm43rhxdUB}K$zv>t^jc|~j5WfxZW$Nj3 zK0bQ7mgE?goGrRr7mz69;*=Vi(qmT8oKFYp-wObv9(m8EZQHg5#tw5Pbe0ymr)uf) z{Y~;EZXlVu1eoUrsXOAXh0J7Tv8Oe??Yuh;IHSON#wbcY<)FA7Px`b>+RBpm&s9Kh z-7PJJ9S{KpTrrpqp;e=-EKt0AC7;D)RnE%qa=G&M4!KyAXbZnA;yY-O`2+Jh=^y|v zuBW=;v{#QmnFk&F4hYpzQX2Ag4%~<_xFhMEiGSOoGFfxoffkw=F70Mnpj`ZFf{8Y( z`gVo%n{wEVc2c`F(;^-`Wb&naM4~$O4eDhRekzS76R|e8Kc0auVjq67X3v?yjj(s` z-UQ?WU~4tsim4`zS)@5{5kW+q(+!t`57AJ>{P;e^Y~bjrmZZlVvBhs%713I>G2hC8DSby~P*cxS{?cv+q722;l8DI(!}>y4q= zir~SD9BQe~rhfWqYMx>%*)}TWiz8ou7`je~#_peYKsU|(+B&J77$o)0aZ#w9=YjL+ zn!RWY++!wTNFtSX4&V}bLLZr%7>ZKHDAcIhXq!2GxsoCEAq$?nnLmZmmTX|Gh*nO3 z2oP0QPV+}Su&9Z?kn>Bv-E*f_vFnQK0&HZ~11x#_%!fC>&ko=bU_h}sMkC&brg}&V5KRvrrYR7-9zYbw2KS~o* zdjh?xR7F!HfbhDGdrf&yH#MAnl?2b~wg^JcSAtV&>j}d!l46KU{*r}>&C}B@6(sse z+?J|=Nb!FR=z?-Uvp)`~2Fk{lk=IMdn8=)QodjW%d`o$vdtb)=(!}1HCtqXFGN0!_ zxx4+=uI~r0N@Cvff~2vmaCX)ZK-CA3hT>N$$+Bz*XlfeHsx&X%oP=J3&0I*vg~Gx@ zH3sui5hB%z&zssiO|^1NR+|qZbDR2;RYXU(iU|G6N3J8iEoXT-&fpoSEztRn&OEWS zbf=77JS0nWYD zW-M;(_TBxdh{~t&V2QAZS=iuP9UUR{^52)Yl}TsZ`-7iy1LRM7=noTBDNhAjV9F^x-ah~%6M z<3hvXm@xp-EKZc=0V+x5br2bQ$MWgDWl{#i%XKPr%-`7Q@|7!{6(@eJ6#g+9-5nM< zMfYN@LuG@aYT^A(^EEamY^ahF^!+SGp{l)peULntGGPy22#6)&-GK8OR8cp46^<3h z1i_`&3s>rR{7^c6caMXw zx#v8<3{m4GFrTC2O{?O=%oLqCc{+u$uCf+~Lcy+#cXqrET;8pU*PIb;DgIBp;$5Qb z75Bs1h`1qN?C8Ta2%iWMerCsT;b>w>2D4_?sr+zGDX}Kg1$(_evFpaijnK^C+1mS2 zN3(6Cx}LVL(YHUIIQ6=vHj8~aC_8JM^Fu#A+ujOa;o#X>XZdn^He+>TC^3@2n!*@F zi}hy(G>Cyw?kqC53sJIf)X>mCH|E!kF=rE!U&c3__s$wk#8>I&(&EkE&nzU$eT+)T zAL_Y(VRLy|F~-hrnt9Q9?h^%$wu>RqGjjaWquoArud5;3#VDZTK!>i!?S1~6AE@8H z@?-kZcD+hc*oqO98mme zHCg+&s~gxOcTtex)2jefxcQ(qR@{&xW=oLi<_u5r;Y{!v3EY(+SqQiTn7|r+SOFL4 zeE5Gw;9Wrdhn7PAIcOVQ0HrCZm2jEGHv#zf{AtYw+e`pACVo|cL4C>BMaRX2MN5So zkJY+3dV*Px?A`pPsOetAP&UoR>dfFy9=oyOml((cI6$EUff~680I4G21~?$6lAMCb zOak{q?=CIFP^jY)(Z7OaL?4uU1@j=j@+kLfJP3T8L^fSaQg&LvFs0T&b zxXb*o`ru&2|A~9RuiQXJ`tgJRPzm$``tO9)S3Df-r82fIsg_VS!WNXXw0! ztTrFGWK9Sx1_h8s6o87Qz#P}KY_S?(6bx)xjMv~F^hSNHk(Vzh18)^W(VqkS4iOCq za_B-mLU$HG=0Nc%*iV7TH>t{mu0smf=oVDu%|?Ne1CteNobES#(1R7h!*BqM;DCDo z0_YY&Nfk1HSTH_xwizLUvB&W+TpU;pK$JNG`4Ul~hRl>@Hf+$(q3e(`4;Pyp3NY%5 zoO!WLh1k(*xZMjX1YO?*0&-CV$%jPx9S>MM96HSkJ8`0L1OW0B^|b>|#NxQQ!kEhi zSaLqJ4kCA_DHdz*ns@Mc`sr78fY+nbRGI0< zZV}Qk&>$X!u@c1Q&^yZJz%@e|)tn=GuZ3>~Y?fnxC#u#pQB7zHqH@n&py6n|^ zwWg-d&Grw#G+Wls2SVhHfTFII+sKFBdoUQOuq%w7Y*v3H@GFU*fkAkbzr7lQrB|C~ zuBtXaR#oHqxSz(YTvaVQ>Aw3ZGl#m?inm^KH|>~`5d4&jW7Ek`uM9@eG_T9e9ub$I z0S0_GG6|!dVsRx_#~zb7K6`a literal 0 HcmV?d00001 diff --git a/stream_compaction/radixsort.cu b/stream_compaction/radixsort.cu new file mode 100644 index 0000000..7f7aad7 --- /dev/null +++ b/stream_compaction/radixsort.cu @@ -0,0 +1,18 @@ +#include +#include +#include "common.h" +#include "efficient.h" + +namespace StreamCompaction { + namespace RadixSort { + + // stream compaction on 1s bits + + // stream compaction on 0s bits + + // puts 0s in the front and + int sort(int n, int *odata, const int *idata) { + return -1; + } + } +} \ No newline at end of file diff --git a/stream_compaction/radixsort.h b/stream_compaction/radixsort.h new file mode 100644 index 0000000..5dadbf9 --- /dev/null +++ b/stream_compaction/radixsort.h @@ -0,0 +1,7 @@ +#pragma once + +namespace StreamCompaction { + namespace RadixSort { + int sort(int n, int *odata, const int *idata); + } +} From df0c98dba75aaace5b7f94773c3ca6c1e60f2120 Mon Sep 17 00:00:00 2001 From: Trung Le Date: Sun, 25 Sep 2016 22:50:15 -0400 Subject: [PATCH 11/15] Update README --- README.md | 134 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 126 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index b71c458..3d5cbfd 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,131 @@ -CUDA Stream Compaction -====================== +**University of Pennsylvania, CIS 565: GPU Programming and Architecture, +Project 2 - Strean Compaction** -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** +* Trung Le +* Windows 10 Home, i7-4790 CPU @ 3.60GHz 12GB, GTX 980 Ti (Person desktop) -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +### Stream compaction -### (TODO: Your README) +**---- General information for CUDA device ----** +- Device name: GeForce GTX 980 Ti +- Compute capability: 5.2 +- Compute mode: Default +- Clock rate: 1076000 +- Integrated: 0 +- Device copy overlap: Enabled +- Kernel execution timeout: Enabled + +**---- Memory information for CUDA device ----** -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +- Total global memory: 6442450944 +- Total constant memory: 65536 +- Multiprocessor count: 22 +- Shared memory per multiprocessor: 98304 +- Registers per multiprocessor: 65536 +- Max threads per multiprocessor: 2048 +- Max grid dimensions: [2147483647, 65535, 65535] +- Max threads per block: 1024 +- Max registers per block: 65536 +- Max thread dimensions: [1024, 1024, 64] +- Threads per block: 512 +# Analysis + +Roughly optimize the block sizes of each of your implementations for minimal run time on your GPU. + +(You shouldn't compare unoptimized implementations to each other!) +Compare all of these GPU Scan implementations (Naive, Work-Efficient, and Thrust) to the serial CPU version of Scan. Plot a graph of the comparison (with array size on the independent axis). + +For timing GPU, I wrapped cuda events between kernel launches and for timing CPU, I used the C++11 std::chrono API. Each configuration is run 1000 times, then taken the average as displayed below: + +![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png "Logo Title Text 1") + +As we can see, the CPU version is outperformed by the rest. Thrust is clearly a winner here (probably due to the fact that it was implemented properly). It occurs to me that the 'efficient' version is in fact a bit slower than the naive but is still faster than the CPU version. There are a couple reasons for this: +- We're not taking advantage of shared memory inside each block to store the partial sum results. +- Each level of upsweep/downsweep currently launches a new kernel. It would be ideal to use the same kernel and compute the next level there without having to transfer the control back to the CPU. +- At deeper level in the upsweep/downsweep calls, there are a lot of idle threads not doing work. This is wasting a lot of GPU cycles. +- In the stream compaction phase, in order to find the number of remaining elements after compaction, I launched a new kernel to search for the maximum value in the prefix-sum array that is used to index into the output array. This could be a potential bottle neck but I haven't tested a different version to compare. +- There are quite a bit of memory transfering between GPU & CPU, which initially slowed the application down alot. So I rewrote my scan and compaction functions to minimize this memory transfer. + +When testing with different block sizes, I found it pretty interesting that at size 128, it seems to be the most optimal. So I decided to use this block size for the rest of profiling + +![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png "Logo Title Text 1") + +For more details on the data collected, see [link](https://docs.google.com/spreadsheets/d/1mtohoQ4BtD_RamWI2KeV-HhkSYDMmendWos7sQgdVR8/edit?usp=sharing). + +To guess at what might be happening inside the Thrust implementation (e.g. allocation, memory copy), take a look at the Nsight timeline for its execution. Your analysis here doesn't have to be detailed, since you aren't even looking at the code for the implementation. +Write a brief explanation of the phenomena you see here. + +# Test output + +``` +==== PROFILING ON ==== +**************** +** SCAN TESTS ** +**************** + [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 35 0 ] +==== cpu scan, power-of-two ==== +Runtime: 0.1365 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] +==== cpu scan, non-power-of-two ==== +Runtime: 0.1402 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604305 1604316 ] + passed +==== naive scan, power-of-two ==== +Runtime: 0.0925244 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] + passed +==== naive scan, non-power-of-two ==== +Runtime: 0.0927348 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 0 0 ] + passed +==== work-efficient scan, power-of-two ==== +Runtime: 1.72386 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] + passed +==== work-efficient scan, non-power-of-two ==== +Runtime: 1.79924 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604305 1604316 ] + passed +==== thrust scan, power-of-two ==== +Runtime: 0.0006529 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604374 1604409 ] + passed +==== thrust scan, non-power-of-two ==== +Runtime: 0.0006317 ms + [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 1604305 1604316 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 1 0 ] +==== cpu compact without scan, power-of-two ==== +Runtime: 0.1463 ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== +Runtime: 0.1484 ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 1 ] + passed +==== cpu compact with scan ==== +Runtime: 0.47 ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ] + passed +==== work-efficient compact, power-of-two ==== +Runtime: 2.01726 ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 1 1 ] + passed +==== work-efficient compact, non-power-of-two ==== +Runtime: 2.01408 ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 1 ] + passed +``` + +## Note +### Modified test +I added a #define PROFILE and #define PROFILE_ITERATIONS flags in a new header file "profilingcommon.h". When this is on, running main() will also iterate through each function call PROFILE_ITERATIONS number of times, then measure the execution time and average it for profiling analysis. + +### Modified CMakeList.txt +- Added "ProfilingCommon.h" +- Changed to -arch=sm_52 \ No newline at end of file From 1797595656c3f33fc7603cc11e0c88b00579b916 Mon Sep 17 00:00:00 2001 From: Trung Le Date: Sun, 25 Sep 2016 22:54:23 -0400 Subject: [PATCH 12/15] Update README.md --- README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 3d5cbfd..25cb66e 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,10 @@ Compare all of these GPU Scan implementations (Naive, Work-Efficient, and Thrust For timing GPU, I wrapped cuda events between kernel launches and for timing CPU, I used the C++11 std::chrono API. Each configuration is run 1000 times, then taken the average as displayed below: -![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png "Logo Title Text 1") +![Scan performance](https://github.com/trungtle/Project2-Stream-Compaction/blob/master/images/ScanPerformaceAnalysis.png "Scan performance") + + +![Compaction performance](https://github.com/trungtle/Project2-Stream-Compaction/blob/master/images/CompactPerformaceAnalysis.png "Compaction performance") As we can see, the CPU version is outperformed by the rest. Thrust is clearly a winner here (probably due to the fact that it was implemented properly). It occurs to me that the 'efficient' version is in fact a bit slower than the naive but is still faster than the CPU version. There are a couple reasons for this: - We're not taking advantage of shared memory inside each block to store the partial sum results. @@ -49,8 +52,7 @@ As we can see, the CPU version is outperformed by the rest. Thrust is clearly a When testing with different block sizes, I found it pretty interesting that at size 128, it seems to be the most optimal. So I decided to use this block size for the rest of profiling -![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png "Logo Title Text 1") - +![Block sizes performance](https://github.com/trungtle/Project2-Stream-Compaction/blob/master/images/BlockSizePerformanceAnalysis.png "Block sizes performance") For more details on the data collected, see [link](https://docs.google.com/spreadsheets/d/1mtohoQ4BtD_RamWI2KeV-HhkSYDMmendWos7sQgdVR8/edit?usp=sharing). To guess at what might be happening inside the Thrust implementation (e.g. allocation, memory copy), take a look at the Nsight timeline for its execution. Your analysis here doesn't have to be detailed, since you aren't even looking at the code for the implementation. @@ -128,4 +130,4 @@ I added a #define PROFILE and #define PROFILE_ITERATIONS flags in a new header f ### Modified CMakeList.txt - Added "ProfilingCommon.h" -- Changed to -arch=sm_52 \ No newline at end of file +- Changed to -arch=sm_52 From ca4cd6b556fe52bf2c03372aced6123bcbd0872e Mon Sep 17 00:00:00 2001 From: Trung Le Date: Sun, 25 Sep 2016 22:55:00 -0400 Subject: [PATCH 13/15] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 25cb66e..e54166a 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,7 @@ As we can see, the CPU version is outperformed by the rest. Thrust is clearly a When testing with different block sizes, I found it pretty interesting that at size 128, it seems to be the most optimal. So I decided to use this block size for the rest of profiling ![Block sizes performance](https://github.com/trungtle/Project2-Stream-Compaction/blob/master/images/BlockSizePerformanceAnalysis.png "Block sizes performance") + For more details on the data collected, see [link](https://docs.google.com/spreadsheets/d/1mtohoQ4BtD_RamWI2KeV-HhkSYDMmendWos7sQgdVR8/edit?usp=sharing). To guess at what might be happening inside the Thrust implementation (e.g. allocation, memory copy), take a look at the Nsight timeline for its execution. Your analysis here doesn't have to be detailed, since you aren't even looking at the code for the implementation. From cd9cdbba90d208dd99ce9fe3741d1bb9a2a83697 Mon Sep 17 00:00:00 2001 From: Trung Le Date: Sun, 25 Sep 2016 23:41:08 -0400 Subject: [PATCH 14/15] Update README with thrust analysis --- README.md | 6 ++++-- images/ThrustCapture.PNG | Bin 0 -> 18171 bytes 2 files changed, 4 insertions(+), 2 deletions(-) create mode 100644 images/ThrustCapture.PNG diff --git a/README.md b/README.md index e54166a..d3f1bfb 100644 --- a/README.md +++ b/README.md @@ -56,8 +56,10 @@ When testing with different block sizes, I found it pretty interesting that at s For more details on the data collected, see [link](https://docs.google.com/spreadsheets/d/1mtohoQ4BtD_RamWI2KeV-HhkSYDMmendWos7sQgdVR8/edit?usp=sharing). -To guess at what might be happening inside the Thrust implementation (e.g. allocation, memory copy), take a look at the Nsight timeline for its execution. Your analysis here doesn't have to be detailed, since you aren't even looking at the code for the implementation. -Write a brief explanation of the phenomena you see here. +I also used NSight to profile thrust performance. It seems that thrust does take advantage of shared memory (24,528 bytes per block). It's occupancy is also lower (50.0%) and it uses more registers per threads compare to my efficient implementation. + +![Thrust performance](https://github.com/trungtle/Project2-Stream-Compaction/blob/master/images/ThrustCapture.PNG "Thrust performance") + # Test output diff --git a/images/ThrustCapture.PNG b/images/ThrustCapture.PNG new file mode 100644 index 0000000000000000000000000000000000000000..862fabaf7d7fda3139727238c6544e0228fc0026 GIT binary patch literal 18171 zcmbTd2UJtt(=Hw`3Q-gZf^-xifOJrLkxuAE>AgsiDn)unh;&2mRZx2G9YK00K?{H?p;Zn{wE`mc(bkvIK?CRAlIj*+-F0K}puVYZVOkh^v&X5kGb_u3Q} z@KGmy>dM@^LewHMi0NBYaoT2e{7?|-qqhf zajE5I-T$A#d7lGwvV?Jd^Uol=n*_wg4hb@2F#g|R;tzfHcLd`=0{&kA_}>5Pg3pWe zzE^c!ZAUJ)tNVU&>unb(zeku$)V|%E*zIq}^L3T3-7t2MP%-M;jOlpq-hlDz-v_Ss zd!6ESARHW=mA)V@5O-h@@n)Id4RVFgV<886tvB9(xkP(=y&sqUbiR71%@^8{ck=Iw z93O}$ck3tBr2_AAf5A}QUL$M&pg5lFCn5pxzmHTOGOB1QP8H1qYBK<1(%vT-Jq&=V zdc?oy7UiJETs~Ns%em(M7HQ+r$598XC0^_+evBRV&)&k5!rjV!bD>GXfyZFY2>Q3P zt@eWKrbb6O-r)_ucFXr><#=-{(`xtUBC&EgpkI0U0z{qc?ijl z%<6doKe{O&<@hVjXsQ%wjMZ@IqO56y6?msV9lcL(lpn3wX>d_=6vYjbyEgep?5_*) z5Oi7VJNgL>CbtZ|I%_BKVxSA{>XS>lX~SPAUP&{G zt|Bg0g4y@Xz)!TjS@zr>gT8f!-5TeO0{I(j)@1NrhxvbCiY!g_iPStIx^+Ik;$V>h zKjCv7J_-6~mnTP)|Ji+92@od;h#kS4?eu@e@woF}VZOn?^Up1Co)i413%UmO7jHlQ zPi+cG6kZ_1YTTAT6g%Sw{FhmW7v^x)g&G)G$^VS#y45q>bVs&O83x5KotCW#0@`_f zZyJS~ea}L>#GKPEPEaOi>%Co=b*tY<6U>j{mFOkv+Zet)Z_z^Z)kL)02SEoT(QRQr z5$^?`!fmk~LDdUdT|+k^$Ff1CRl7vD)4hX)Tt(elo(I!7aKN&+EjLK_b}m9(axj>scP{^9^-MwYsaRqXnan~)Z2*@r6*1gh=ufkF8bA@(A6+p^|M>-hAcq)jD`kVM ziTHVBUtV>lPkQ?TaVjv9?wjO2iq2M=Q!fg;=7X(3d%hZ_U*`Rz&WYn4^Z@>+;gIZ~ zBGLEP^sfsQI~D~g7HLvafOZ&NbX8rcD7DO;Cysu>kd?A)X2VAiu0Zch4RRy_EHn0&3OqS-sy?We~EX5sy@R1*D z--Q5%dspS?x(#nAWj=Ghg0-s>N-bUX$$Z;!j2#+uUY2#m>E??S%e7Tj8NIl9O@PYIo<(oQ{2`5KCzDqSf6DSz&^3 z2o3@-8fW!2S5c)FH_y%9HoLC={>TDk}&utwu9Hs zORCtZoB;3R-n{bW2Q2gdF&7lnN+lZgn1l3`+(g_1nDHh9#qplKSH5%5N(Mj39L^$= zx`}Fc>Om1p(XQN=x;nS^Ddu7LrS=9atdxj9OmQ7`cY z>-snOjKRyVlsQ3dtv6o{&K?%9p3c!TDe=)cWis{_-u36=1Ua%r@qkD_-V8W1tR2h$ zfo_HG@r^lr&dEOEZF>@DjeP|q0qaRqm2x5hJrW#ouDnXGLCym|0%Gp~P@Fw)HiQHRMxoXHUAx~E&u?}|I<7G$zEzx#zB(;>Z#og$NoHI1Sp%^fVnG%hW&k!XPFZ- z!B=5Rnty_Y0U2@Lho`Vs8vq>e6no1m2cGnQ%x9$y32aY>DVc^Oq3hE6OPEfekBE-y zPIaOyquPev)-q@QDbK{j&{^oPWz;HoG#~!@miLn{1KgVo4E@(zw zzzEh4>nBOWu_8TaZl}k71^@T+pc37PdxF0Q2ry3}9Ey*oZ@&vu3S=x{Hj_K_i5j*2 zaPyr2&>r6hmp}9ZFOPRaOhhwHux8`<0=s$joN%Plku?D(o&NT{M#J6w){^SYkxvn# zHQA!;CiX@R>wT^^i)(K#`h16i4kT^}52Mlp19kMtJB_-Ck0RMY`pEMp4 zt18iI1~SH{CC7e970Iu&X4Sr(LgO|M$Q8p1U6%Q!}lPN_P*Zz&--?N06V z?IFn#^s7tZ#%g5)_vn=I>}R7SG`)F0H~)z@TnNHG9>w4LPcPZyVg>2)-;40yUlkQQ zhu~RRoAMD^7mpOIl_hp2eGW!O1`eDBC#fPH)$Ot}i#vutUZ)S_$0v~5>iE15y-9JFmh*V)B#TinOI%N5$3pp*RRV5%qk z{1uVFPj!?kiyA%cC#-ZLTrf4S#ZzZ^t{rb`FxRkct9Uv?Q+9oy#-E-f5&1 zc=Y4mo8Hadu1{1KCkL(==b_ZP0gFRSmtA`<*5>iqz9K&NL+p0DXCG8Vg+iR>7565T z0?E$U0IAu(6;iv)j0l^wynZ@BdlxtNA6@Mm6+uV2c4bM*Ucl8XN*3?Ui=-+^gwuqZ zHy(Xsp&%Wmi#kT&=aQOViV+$u)TJ3}7&hA%HowhG3w`R#(Je@mELANmdBQAdEi%U+ z4W01@@KZ3%D}M32wf8%3U3_`#qN+RGOAYh7NL{-*a6Pk;#StA*7#V%?{v^_KvsZF!FQj(r{l@8>x_L6&qo28^`vnCv-<&}LZUeA?u%w91!jo5 zY^vNp_P_EgGQl^$zkU&T>nFoQePD$R7QoUzUe-2X5QX?3rb%hK(-$V`Dc!1D@uM_m zV9*kcJ(~SzdVphNTJJXGxzA%tIsoOtw>Q1zo)ckFh_O^K!!RKpz30eeaSm=AKp9cUksFQ%$K>zW7 zJIBDF$7uTDTVJuZ57a}RqQv?#VokHd(dBSt=LI3TNNA^W52g5xc7FlP<@f0hx6i`y z^+N#u?OPcE(R=78pV2h>NJU) z0T$%_A_up^Yhx7ejVFH>k+dP*TZuV7x4f-4;zLmi6Ch>4@A8^ww%uRcjd1zc9klmctDmYf2i!mKQ^U`;8niiyKn!E{FHbSRTWtwhPu zulJ^bypjFIG;gbY_=}ZHY{8{_FZ37a374W9l_OmtT#KZ_eBP{KM2uHsL#j~56L*CVY-->S(g{#>ce_3%^dAlK9ooI_240N&*Ne$ zpBeShTcwhe1wRTg1nqJ`o?A4zGOK*i@ST=Y5M0=Jp%i%asbOeY=iJTR>w}FwHp*#^ z^i%1~dvRyfLD2Q{n;0k7<^?HtGhD)-_uz6?Jj)m~NjE#lf}|}O`b(Szna9wZ1k4^# z&VG6&7h&B{z0CwSW{TFV<-4boahbc}l=hDenYgwS)2$jr6Z+j9xfg>FIJECg@S-6C zwG6XRFt^jwq`>|rDei8@f6YkZzn?cCCL1rRKb>r+-&#fC0G3HBh<~%**#5y!Q9Tp9 zBoN@|dDSMbVd>?N8)vM8!#d%jxSBe zH(Y;~9&!Ut4Nk@JVZ+{CpE9X?*NG6)kr_j_q`4!OJZ2BwjsK+}SYw#{sHCW_a`G&0 zhnwc-xk}KM`>h{GUH2R#r*7;1vJ*f*ZI4p^5?Fv4uzaO%wYnVP4-_cyxH#@I5#t7> zL;QLq6<0UARMxAWooG-C_+8@UR9{a-V@AlM*^Dw#*LL4aSx@qH7DUsd`fp`+r%dJ7 z*N`?{)dCq3+mw4FUrUcau%kCKj|A-t#uMi-)vSps>)&t2YsMZ_grhvY*Oq|HyOoWa zet1%L4iWwdMJnI_v>_nb=>E+2W7`^Am+TJYqui5$NJWWhz84iebTX>;I%9%t=MU`! zCub69*Tr~XaegI9Dhlf>sj|yN*=8*uvzEq9xPc!WHXdIOw$6;YEACL>*vTmI4b3_t zu!ezTOg~Z7hoyGJy_VlcKM@Q_%R8|^w8E_h!mkQJmH4fsyW@B^MunHoV&8quyqjFp z&H@$+Xg{|xm-cHQ>qN6oc2=L&<(6o8>>T+lJb3_7dwj^@IWN`n)RgT=_ZmdJsobZW zA0h3)5t))4%omc732dS9@6Cqc!c1&*TGQuHg8kU_`(a6=fQ@5LO{FJ!(Orm%-uI+-2ka4TJHOh@nl96eI zqm~d=A%U6<6PIw{$E|rDO^W_%G0F4g<;aD+k9 zWHsgAFo>jJ5@Az1PeDxZl3SdgVbx=m@<)Y8#?s;>TkZ|br{oqF12As(65_6xSu}~m zwO*s7cjC7PVF%b%_=xDzymh0DXN?ot>Pq=!3A|qw)ohwuBgl#x#p6h@Y*)iL#*IQ;}7o?5$9 zxrIs?TQ%?hc(U6)lpuR@e$>_4^o4Y_S=+y8mg(Mh4@BH2?Mi(r^JE-WlV|5+-2ct) zort?ZYd_(Y+Ena#SRs;|zNK{`&tBSE(aVko9vf_~9m-(SZQPwnSpL=icsT`Qp{u`n zlk&*Lg5Y(V@XOJQHpuV$vf$vR^)(p*fG}7xfM+w8G}!PhvTJU+Y&LV%bf32LRBgV%%BwPD9Yo zbR6gd0FDNOY~+rb%Df;7fywL#ix6CZKK$cVUK56J_~)zxA2vgFr}|Hx$I$oeH1W<~ zj1ISxsree=hZ(_QKGTlpo}nX=aF9=9{FAOt{eBJU56E&qjiZ>qu0>HX>FP3|uqzmFOad;q-SWKlve47n^x?xczz=*mIO(WafCllO4B zfJjC&8vTMS)M1&?c)Xu6_$OJgAJuJXWI#WJzl9sr!agww{0E&?m9PZwlL{Wn{*}1< zz9SDrJoml`h4?>*m;+x($H?6$J05udD34t*vSA_kFQ)70{Qm@ZSJpo{_zC=j1V4-D zRPeXKC}Vay!+K^|EnU)c%|7ZXB%+Yl?xfk zntveY9nKX*S?B`lQ>fUrMh@#KwD?mK!qqg5V_0<8Efnz%5<4=WvVHAW^X-j=-}N40 zs6I*8hG#&knJL)i?modPa5@zW6s=uF(g5l`e$lrcuPSfOLb$%69R}nAN0dH*U!}M| zRLm)FFkF6*eVNM;O%z?QuKn+jA6uS*eBDp_wXCnlbHzs(;m%m^rFxo%w6QRHMD{dG zUHH!2(c`rrj2t7B+RjxTJGZg5MZ8U}7O5t*fN~NnI0VjyYCFZb6A6yBu_mRfj{8-X z+HtH3Joz;Q8S9S|j3)|>o9#!uAc~jeO;We};cTRwJJnSObzs5pb!9hyyb!+v@WbBB zZ@c>t!q3ZNdjCG430f}D?NmEVn#sZd3}uW^vZW@57kZE%zLVJ7vFacBJS|aL+AN8=B@=MoKqK2}(_FMV21ZVJx|Vyv$uZAne)esi8W zBlsC2-l=;u-IIk;QB=w)6CzHL`ixwVYMByR_lSd<7pBGSmlFbJ?o)e_LH$`cJsjDD z+*~Z!R{1EQzutKeL&3W>b*bLEZ0E|?lJJ(C;6%SQWNZ_<9)F$#eQcnI!w(jQ z1GG$Sz(C5o20DqV()RDcAi?t;w+C2%d&{itB+^5VFJE-TcJfH6^BV^=1^SqMq}-N- zZ@22^Zu?jB+&ydiF7W&aj@c(6S{=c}&j{3b9Wp zW;^>#{NCdsIN*#>5!b)T{MGL{q6l77>|1$3a+~~G*whJ&yIB4w7ya*f2;@1ucP9-X zA{MY%B5I;OF7SoNctQ%PV~t7>Afco4m+l^iwS9#v+pO?-jsI-LSGSWq>sK&GU%mOB%iG&xPL+DCjzTwlOdIQV_6jdI3w(G4=4jBo z!@V|e&YT-|hC=^}pzN6x4>uP$=-UcX(>1 z1{1k)EFW8kb5Ek{{Er)(BZtFp`RXd~2M~>QFw}V*GYTGIYFKqM%1qxj1nOo;d=Fi_ z;+_9I9e->=Zx%VcG3!?<=`-9_!h&3$CYcH50(qgV+?y<37FXY(^pjpceGStwhY|Lng$npUc2S#OPtA_t_tJ8yY_Me^Ch52f`0n=Z!Kv|c-bBm;<$TbhV znc-TblJuXr_Oanp!7?Rdmykial%q!v163`obUha7<6;oery!+K=5%h+N8>U^2-2s- z<+((O1!V<4yc8H+T*p{&w7WXvU(xy*c84N?bj{sN80HqJ+$@U>M-fs23jb#N_aobf34}6 zzu>w1%$zu!HGXkob+7sAp1y*FSOkgZwadNmXI_zS1*BA%fztVeU?lKDc_H^21a7YX! z8W4rxN$RdFEVEUZdAdwh><3XRTeglTc};qy))=glST`oYjN0WTh*v|$8UH*XgFj;` z>E}cArAmY&j~OzpGfZY*+RyP>i2-;bIOm*Q-9z?*r$(fg+dEY{n-3VYEx~b!#EW~B zil$zAe;VTMWGJLX{!O&(Njc&yU%-y-k?$_*ZtU$}Qg`6v%+q6m=shseT$Na%lGZxX zUXnrMqIszJd=IDecpn?A{5Qx(j5IvPgJAdOw{IXhV0mYhrJQ!M8E}&T&)|F9a`WU{ zpM%n2WAsYIpKlL5ZzaWOap%P};LCS9hJfGtj6EyK0L#>l#B(ogX`~Aoc@lzNxeOyf zwr_j3-o3j?DZcdcP^Ye!HH9y;JbWGAAffdj={Z4rQ%;EP*i8a%dhf8}>MLLRH}|;D zTjA_Y*D5_hDrUl1$fI6G&uXv$vISu2(zJNF!ArvEmO0FMcrE6Kg4SBs)oiu1T_iU5 zADL@9lz2O{D6ap^ef}j?jVK;eb>|FrQ!GR5Cb&QN%Y>6hND*x(tt+$iHK&rf^?;es zu`U=QjwgG3on7)ExR%)5w2yuqN9*A;u%pPAz74g0ldArUZ48Sivb!6IQ_ncVd1RN3 zN(z>YW^M0snv@HCL`6;&F>xL}2U38TNVkxy1@?c}QetpmynJb2`)+Ifw;--#^k@Y~ zE=+ln!yf$I_Leo2)?jUFc~4+Fxm!D;#r(#vh>%$pLLm-(BrmemIu3>y7|u;71eQpb>|d&4!R^8>0#XUn=*`G@Hw4$m8^GZYybD(~?ZsQPSbCEKk~lj(RWq{0h5@)PqEQ6K z+^H1)X4(#5t;b?X-9+v58Sv$ar-z@C0sK*RFJOCUQV+SJM08j#+Y+Hzt~h4%74xVk zIypelbga%**FY-itWufR-r|w0kO$4@A5|o$P6F52?Qb8gQzvU#fxDI(Vm$vel;o$? zN9LE-DmCt_Uy3NMl8<;vAqvV>}F-n)SYy0}C;*&pIbp%vc;H^Qa48UVLEpf{yQ71S8s%JRo>X}Q&EEoR63v`dVU4dy zi8c@S-(gq7?f`~CXDw#!5fN{Xm{97i?v6KcNoa?ueQfmuTYu)qyOZnj^xi4~0AL(D zkOGV(3%t5P-2Gca?R9()&x)07Ady(n3?~2g%nDKotB-r+|Cfi1X@E+h$)hA_Y#coE z#hCntu0yKC(PW<;tky%1#iiXdx~hhZIIjkL-PZGbNcZljMoIFRD37#&-s^qhslvg> z2D^l>EC`reM`CLjy~W0C?^rTnSXD)^dC@LmCXKvpB z;q}06vg_*w6HLoxVJ7j>F9Kt*G27n?*99Jjze^=T@4okHzhfQyfV{CNW%~b+48_HQ~+K zsiND{;A2Uar6~0tJ9ukpTUqQ;RogSB-N|<*A8&Ls>}d&5EOu3Gq1mH$Zk@{O)q*pv zd^c;;hcDBg-ee}w`Wzm<8TDm!X7WRiPx^V%AEsdH!l2jQ=gg6>^2*R>8H071VZsy_ z@eHq{k8E`=R30b^I?lXvG?urVugczUx!WaV{SD=aijOCb3~uHi2xO+fN($DVUmS|>cPsT$vJpENlQa z7tt}v*m(bwg_Ay_E1u!xv6=nVWGtFh+#-92-fr6C+|BI*!P|Xxxt%52H$rKJNz#*N zJ+34i{1`I-%?5Y)P|er#cv%KqJO5rzHd4KPsL~jGmnih(I3*Dsu9MYjX9Z+1)-hyo zfbY;dUuTDIJz6n4KUNjE1XI=4ink=08sqNIF|eCjo**tQp7x6yj(VB*aA+o?ys=Vf zjNvUs_7p>@1B-Lmz(jakvAoCmZSn4-qs^1=)SUPxkm*USoglwz`{UR)!(Y~zO(NiW z?8B}+p7nnGE?P!?!nxLK!o7^Yg+^RsspNzrxM5ip?rYM~DXDcvn>4~<2zQA$V^1uZ z>o%BG6${+m79eHvO@Fl3(!MGc)=DU8E8Fj;x49g1OSY%2ZtOl;cn=nA?WfOa-MC5F zskS-3I3K+|TvZGhT=ts~d@($%dS3%>eCPb@Er*{#M>~BFHojF?nYVBJ^o_9e1k@+g zEVQ!wk<{O%e8*MEHUMkeCm>cQ#uGE%@d!c@w->~rvqPpZ-BCW3?c#?D8I)7 zd-2LT*zyRp&A!VYS_iS{4yZ{N9pyI!f%9F-cDk|$k8QW%6RZLrP zoT6qz4lEzQj}=4^;cAT0GYJ-@CN9-qz0l#6!4B_m7;#{-DoP{n3oA`$C>u zBTpN}Y#RyA_6v=8^xLZ2Y|@L*qft$r^pm`coVt@IE%KX)!qUsWgtc5AdxzNKu9v#f-CfA@jxjqd^~ZTFK{2pZ zVe&)fG3MaxB@G2qYvY|^lGHj<Xp4e zYp=5tRnxa)7fp7RTTetrai{t#1}vX+j9RZht6pch`Mz~zo~o95_hl%0`4Z_O6GQTv z@P+?(2OnC5??~bCxM;j`P5i*{_PJkX=r+2x%?|me)qeEoZEfkNKC|zwp6SmwbtHy_ zR?w=bMz@{aB)8_VTF8qNgY1Edr6s|m3yAiXMO%dZYa;W%YwTU+hpRIF zxGtKw+|^I}SqOX6CL&$ZZq;Ffk?kZAw}=?}Tz*)gdSNG(W|wsmH*E7(AED+YGvM9k z^vh@R^v+5@cjwMo!Oo}nsIjZ|vZCYNAFf6QMk86b1|o%vwez*-TLcbCdzi(2$edJrPDu7L6H(o}#QKVO|m$AAazt@Yd2zUlU>t7YA zBDaKLq>L46u4=g3b+GPIqs4IRYzN8(&zIIyZa-*f^cLTkqx+R0u}s@nyZ3#XY`ioP zh~H?c6Pzhz0&@Js6>OF2^5@&B%t<55miDOw*>@Hn%Cu#n2;=A@rBUD4t5js8lk_uy z10j;L9tb=Oj$DMqz~g<<_358{n@Bg)^@zv&yaS;JybrWOm^0H2Rp_eB!EbB9` zXe&x2E&01g9owHf*N@gJ^*qL9J%kMTNKrMAu_-NtMt)JA#kH`-q>RkIWZ-H;R1peA zKIx|_+hT-ouUD>Y)jQ*UBeEjJYdR$tc$trHpT+?b`zsI~PM8b#R8j09Yzh=DNPd_4 zsaovyv4cvzot*L83|+Ys3~yV-LHV;nZUt~^bR^2oO^Bt^ld^MTX)Y^~L#*E3W3K2l zz<9bcMOYprr1F7@5sXhvicOY7MN$U9DQdnIuQP zfA$w*SULh_N@J0FV}^B7L}%4KN3S&RCTteb{u}o%U+D3_GdCXyec!C zLV4lZl0|p@<+q41Czp@+=R|@VLZDCJYG<6lh((le!&UYErJL!^rg7p_PfK_hxAH1bg&# zid~)XcK_zM;gZ%E??VmRDTfk2*)nnsvgQp16ZPquFs07SwZB0ik_(2Ywtptotfc$t zH>5*mV`-lw-J4!(f$!cdZ6fjBnU|b0f8m8V*0QVd5Z-cAL&ZL2}4WvPWU7NcY`|e z8l{MHb|GUwMeO&2N>jgYGG%sDvPoD^Jeo^5D$w$4bM@G$7Q?J<*hMG>_LC2BA_jNt zF13EowdL<3Og}b3KcD5oC4=Q2Vy0m2TD=wxVDZHs9~#|)sr9Y(=Amx-bM)$aA5vn! zKy_?Pxs!ji>`M2s#G6H44JjpMrTLmo1(Uj47-3kS7$SS(aG$ND3|%kQufDu_dvS4_C?K)w`}{OKC?5*8&4bfp*|a3^4wAPApREXk;}TrQaQbS~+5Jx-|kNeUE-R5CNTd zPB%HNAi}V>qj*uq zbNkM9!a_@b!C?NBaH^|@p$>jk?dsbv**o$825bE^L?`PCD|JIP#zV4?GAMwtKgT8+ z%INqVE=@|*7Vf#y)UuVKXR7U=SVVNx>GfJ0#oJ9_dgzZgC8lG}nu@}=mj5%*ZDo&w4YFg*YZYGtUX)K^*weHY6| zcaC@GJJQJd%kRIJ=LyRAn~E%?f7A`u8(M>P z>UFB=elyp9Y_gk?>Z$8+(Frj@;@avGos2)qRLRslkt~s&i}s}#uglB1l&$AYy=5qD z?$|O{K$iWKhszb-S$q&BurdWJR$*_76k~o6v>vyq`pRYQT5KVZIi&dU4mJqZL`IE! z1f(QMDT&RoOP)~HvWTCIQY)sJgMCwjZHmg;qOM%y{K7x1xLMR+Uu_hc$(#j6Ye66# zsIS^}pU-xOW`37*?rTio>Fs>7gD-nV|BRSZm6FF8aSjwlzN_#3W_qwOnM^V@Af`H; zR2j@BK~5P!_W3pXNhcmQj-H5i_>HZoj)bU`dbupvX7>6t{yOjy-PKPSxv7D!8a{#J7|DPNkAr}${v2V~8`l;W5!Xgswp20U#M=s= znBdR9={)V?^Wy7^s=`x!HFw8O7!=!-PgCtuOJzHci;5EW6qYdFo$K`|G`k^8F(>>~A4o>iMH0`-EG^qJVfhK)aqh%-Q#5VsV za{+Ed0d+Le@1p-QS}-q|OE-BLtBU5eHbU&$rj#yGwfWxM9t7z^Zw&39j$wY%zu!fU zpMKyXhpq)R^Op-co%G37*Dv<8daTuLAMu$^Byfn&DPTP&{(%lW~&v(~uSsrePvUoJzy#B!$qE70dmyqvv@mpdudqTaCx)haE z+jbg+3M|VcHtF0Wt6)z0LK8ji^DgLk6T1 z#0IAhvHU9D@e#zlMeOy6UvZ{o3JX;C6<)qoy?$ z%dYseIJ|_u;>86`%k4eZ*nUk-Nem2dKn;r9%2WbF!$4c>h7-S4=6RouGs ziBcvQk}Zm-hS>BREqHDt@VoN&slx4RpL5s~$9MOZ7;=6WJUz7qUz1EU)1ue-U7z>7FMl1`^Jv&`yOCKoJ87 zHL3K_l{};=1UMj}e5jwuBjiR^hWIghv-o-%9AsnCfJ{nIP-UHIa^Q3h1w@{fYDb|p& zr%GM5#TO00g69&<>aw>gAhB{ME8YW3QQ~5gr^O`>{(FgoyznqZrFFHd_oI)d9F|40 zeVo^LKy#t%)2f%5eqn{z>%i)nJ20zNF~An9S1&F6n>52(LDFpWrLZWF6zm17kldMeTVfzY)Ti$~JN+N?Vt zS#sV}KZqE%+~$LDD-&#I*{Subgd6e3g5%m>i@PAL;LBK?HYIeVsqJ!u_YsS-oX+I% z*8-A_*C!A8`{={!|5W~o_YJQZ1Y$ROY`uw;R2Go@S8@Xko?z~5r3wB&BM&q7K zFz0jhXskjAy=;%`8ryR@xj>JJu&gw&CBUAT8-y>Mgae=X(m?$h%aGp#w#Ek|rHmgb zn%?nF?VFFHz;+ANMbDe+drkweNjo-J#WpegA2k>(msI~2$M*asvx?(`{*Aiu17k8_ zAC zDpaXc-%@|VLr-dcqgqK41yU=Jy{ST1F}m!*c?TfMY&f-6BKi`VJwaAa@qF;>8 zRg@rncw;Fr*!Ty)-S!OcLT2uD#rVl@SajUF2aLG>1 z!-a)w>5oq87>_NbW`s{e_fZ1E8S{*gL-YzG#n6_En$dMRUtOKNP@$5A4rPng!{m8+ zFLmBj;HOGb8MZhUDUAo1uh3B(^R^dKpYu@bsF~IpdOS&;#5SGD|II)rLwO(W5VlK? zeS4z@mUsQY)LX-EBY23wf~xK`?`-*C-4e4vN|{y>$i`T3^6wy|dX>w!j7GLgOt78G z2fXzfkGLZ#TVBw0Mlicx*`4JtOkCdx52rX9#w$u%@fCW{wsL(X;pe=>Q=Q%PnsIES z9`st6bY^}Nj$?_%qvemzG?4T@g@`E|Y2M0W6(2nuCmA?dK_c~Ncif}26Uf+YTP9Z4 z%X6cLY=N_&t2Zt}9Yb_i3AHD5@*VemY{4J;Np*b`W?hTS6a!Jm3^zu%A8`{A-SgaU z{pRg=1W>!*?5VS-6*BZ>z*MngfqjycpS_f0vKx!JKQVE z|F{cueTzrjr5j$ovG{aeY+rHWajxc1_AEcqt-2n_CBN|0pw;+2_!-?RK7f6KOLldt zku+^@{?S4hrVbypy@l% zr&C@*TG5mn3%5_^Qbl~({cq-jlgnZxW`{SmXn?ZYx$8t<{gT@e=d~l5#LQa)#82j% zLtbA<+2QnZy#@(9o;Po};WBl81do9_zjt%xF28{$XN)xSN^W>TSKNrfP-CT~7D`|4 zHBV#kLvSZAk9I`15}5{kLE9y@a9m(PB?f)*I_KC;V0YqTla(J+y>vrhYX;5*IeEv< z?X*EM3)z-@I=E+kyYA_h~LJ&J~P5 zR;T?S<=JxmEw~u%I(B%CG2Br;gr}p!NGt)0gP2+D_~I zU%mF5E7)+RmY!$y80Ga?EB|+y*-&?)q#Piu7B$2WF0G@y9bPZMMVXkY1nCu=dDdL7 zEza&ScRA~=H%s^4@XuUhGPKZ5P!U;Zn0~GhaTjp{g`qK1TY1eUPVFUWkXM&^7$R@yJQ9-HA%<>++qdU3?;9jyHEXj+B!o zAFYqZQ?9e9()hw(>Kg_t&QFIQ5vie1rcmAnF5Yidh?+|6vZ1OpwYEz}y6!&BIj6Mw z);>EsUE(|QKKBf#xY!eSQ0M`l%*Y?4L-nE`-}eOc=T+{0w#y7u#z($up@FIzA-%4v zmtXH`Es_D7gD+5}0~g&`g(OM^rspX$QC2%og7kjoJ=N;q zh?eFE1k%aNdbl5AUv}5LFijPkCQ=RRGZd2_V_1GpDM4!6eph{)ceAOYA!1-ZJh045 z0=uM7<>^7EA=EcOpB!@5Q0Obu=Pq@rR|ty)1_2%){$Yqzsm% zf?tI}odyq?HIJ~BWaNsfAAZ5|mZ%kT9zV$D0Wr2#(+=B%L0?AfQO6ewBHKy~W>$N$ zeltL9C&Gh^Zl60*CRhU0?zJ0BR(iGP1&V$53F9+YY6ng(STLIz1V}gix6X`1gDX>{hWS{Ql z>nFn`oy1d&;fGb2jNmsX8a^u4JEx5;jXt&P`mzp=uh}mWt81>UFgw0-1BbJbC&j!V zM(9ZmwdLw{DJ;ZyAwfRk(u@tOZBmBIlID(re zof{THT;Y}z%`j4(i-mTrjApKh*>Th<+Rwsg_76?X$ zVT=}0;?d`y)+WkJ(+kx)~ei?a3i8a2SxzleMNhPb!oV zchx6@j#lJE{_Gx^HS9+XyzKIVVCj(5E5ViGLR~O|a4C+`KqH!PLm)^lF6!uD_2XCQ zxgEBcs|mE5FU!4tr~jairE6pVigS;Hwro%_W17Ka`z*6}zizNv#g5kYJFTLrNGx^{ zileP+xC&Rln<23?-W-`xv8DuW?t5M-dwe2N6Wg(y3T~jZcM%^ zfk^F;=z;r+5{MTjmpxNO@w?*;^?uI!1uwjtxgVQ+ODICJ?@}^EpKZEh3X*n%95Hjk z^@ifqZ$@n3%jaI04B+->3Q)}VNxqmZ_hf8uU~Al`Z>L9xd|r|$4=l|_Jbiv136eC7 zT7omJ3j`>Z|Jtb5!ic~sMeuA2j*Y(Kp6T=eSWXv z73MYnAzn|s|adFyvv9+1xpR^}}8PgB!+#nwV zy!r-tj%)K1*v9At`Q<%g{E@5bHV`^5bZFlAw|y2D%giW9lUw~ zaoWeWv|=%Gy313Q4)4?Fw@*Uw-;pw4&2-KuBO3uD!3F=dQ|FtEZgsz*)6m^S-GXx3 zy9P@J6m}GMB2LQ;TNQHC@_h(in!&SfGOl<+F4-8&z=DyN!kk#@mi~-wsTXx`z9`k%vCiLX&j0mHx=+6z)%_Bl=dbJhp1=82`lbFq`)|#A zy?;x+rTx|ZJ^YD6*&olV{OW&wapT1O?n``_QNRU-` zhLnDkuHK^MpBBQJ6T(7TanFw2IF`G2Rm0AwF450lFM7PVDk!db+s>++e{yf`+us!+ zY^QhT_v-xk`~11}PioZuM*QBj;M&bSMGKC+{ITh4+$!S>E>3x8mHS1$cr<Fu|l^VQbg68^S0zu0MM@2l(j@=utZ+|&*x#UG8Wl`Z|? z)AYUAY~qOm1_d?#b*>q2ayv?t9)DNa(Hr_;$C^vxelJQO75$csUv^Laoh_W(_u(Vg zblES{v$yxP=N5mxS{Q5J^g8~M+wbY0!%kbu`XuQpCocb+Fwc34e>uNT(~8Ou`wlGM zba2K>PXQfkO^4IjHeBkY6e=}acmXEnzd->b? z$UgS#vnAsGZZ>@K@6V5?8-CyESE<+%_$$VCf!XQc1Jk1&Az5d=4d5WymTU1J-|90Q*bD69W6eYb7Jur0Y@3t9`s{xV PPz8ghtDnm{r-UW|qxlst literal 0 HcmV?d00001 From dd30d570b7be10ea0b53c0652c60bf4db80e8509 Mon Sep 17 00:00:00 2001 From: Trung Le Date: Tue, 27 Sep 2016 16:28:55 -0400 Subject: [PATCH 15/15] Modified the way to find the remaining elements count to be O(1) instead of O(logn) as before --- README.md | 4 +++- stream_compaction/common.cu | 29 ----------------------------- stream_compaction/common.h | 3 --- stream_compaction/efficient.cu | 12 ++++++++---- 4 files changed, 11 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index d3f1bfb..c5b8d8b 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,6 @@ As we can see, the CPU version is outperformed by the rest. Thrust is clearly a - We're not taking advantage of shared memory inside each block to store the partial sum results. - Each level of upsweep/downsweep currently launches a new kernel. It would be ideal to use the same kernel and compute the next level there without having to transfer the control back to the CPU. - At deeper level in the upsweep/downsweep calls, there are a lot of idle threads not doing work. This is wasting a lot of GPU cycles. -- In the stream compaction phase, in order to find the number of remaining elements after compaction, I launched a new kernel to search for the maximum value in the prefix-sum array that is used to index into the output array. This could be a potential bottle neck but I haven't tested a different version to compare. - There are quite a bit of memory transfering between GPU & CPU, which initially slowed the application down alot. So I rewrote my scan and compaction functions to minimize this memory transfer. When testing with different block sizes, I found it pretty interesting that at size 128, it seems to be the most optimal. So I decided to use this block size for the rest of profiling @@ -128,9 +127,12 @@ Runtime: 2.01408 ms ``` ## Note + ### Modified test I added a #define PROFILE and #define PROFILE_ITERATIONS flags in a new header file "profilingcommon.h". When this is on, running main() will also iterate through each function call PROFILE_ITERATIONS number of times, then measure the execution time and average it for profiling analysis. +I also increased the reserved stack size in VS to 0x40000000 to prevent stack overflow for larger array size + ### Modified CMakeList.txt - Added "ProfilingCommon.h" - Changed to -arch=sm_52 diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index cf8450a..33080b3 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -14,35 +14,6 @@ void checkCUDAErrorFn(const char *msg, const char *file, int line) { exit(EXIT_FAILURE); } -/* Max reduction is really just the partial sum upsweep algorithm */ -__global__ void maxReduction(int n, int level, int* odata) { - int tid = threadIdx.x + (blockIdx.x * blockDim.x); - if (tid >= n) { - return; - } - - int twoToLevel = powf(2, level); - int twoToLevelPlusOne = powf(2, level + 1); - if (tid % twoToLevelPlusOne == 0) { - odata[tid + twoToLevelPlusOne - 1] = imax(odata[tid + twoToLevel - 1], odata[tid + twoToLevelPlusOne - 1]); - } -} - -int findMaxInDeviceArray(int n, int *dev_idata) { - - int height = ilog2ceil(n); - - - for (int level = 0; level < height; ++level) { - maxReduction << > >(n, level, dev_idata); - } - - int maxValue = 0; - cudaMemcpy(&maxValue, dev_idata + n - 1, sizeof(int), cudaMemcpyDeviceToHost); - - return maxValue; -} - namespace StreamCompaction { namespace Common { diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 26a03e1..e3eedd3 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -33,9 +33,6 @@ inline int ilog2ceil(int x) { return ilog2(x - 1) + 1; } -int findMaxInDeviceArray(int n, int *idata); - - namespace StreamCompaction { namespace Common { __global__ void inclusiveToExclusiveScanResult(int n, int* odata, const int* idata); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 3f23434..123309a 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -39,8 +39,8 @@ __global__ void downsweep(int n, int level, int* odata) { } // Should only be launched with 1 thread? -__global__ void remainingElementsCountForCompact(const int boolIndex, int* dev_indices, const int* dev_bools, int* remainingElementsCount) { - *remainingElementsCount = dev_bools[boolIndex] == 1 ? boolIndex : boolIndex; +__global__ void kernRemainingElementsCountForCompact(const int n, int* dev_indices, const int* dev_bools, size_t* remainingElementsCount) { + *remainingElementsCount = dev_bools[n - 1] + dev_indices[n - 1]; } void deviceScan(int n, int* dev_odata) { @@ -160,7 +160,11 @@ int compact(int n, int *odata, const int *idata, float* timeElapsedMs) { Common::kernScatter << > >(ceilPower2, dev_odata, dev_idata, dev_bools, dev_indices); // The max value of all the valid indices for the compacted stream is the number of remaining elements - int remainingElementsCount = findMaxInDeviceArray(ceilPower2, dev_indices); + size_t* dev_remainingElementCount; + cudaMalloc((void**)&dev_remainingElementCount, sizeof(size_t)); + kernRemainingElementsCountForCompact<<<1, 1>>>(ceilPower2, dev_indices, dev_bools, dev_remainingElementCount); + size_t remainingElementCount = 0; + cudaMemcpy(&remainingElementCount, dev_remainingElementCount, sizeof(size_t), cudaMemcpyDeviceToHost); #ifdef PROFILE // -- End code block to profile @@ -182,7 +186,7 @@ int compact(int n, int *odata, const int *idata, float* timeElapsedMs) { cudaEventElapsedTime(&milliseconds, start, stop); *timeElapsedMs = milliseconds; #endif - return remainingElementsCount; + return remainingElementCount; } }