CIS565-Fall-2016 · WindyDarian · Sep 17, 2016 · Sep 17, 2016 · Sep 17, 2016 · Sep 17, 2016
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -9,6 +9,7 @@ set(CMAKE_CXX_STANDARD 11)
 
 list(APPEND CUDA_NVCC_FLAGS_DEBUG -G -g)
 list(APPEND CUDA_NVCC_FLAGS_RELWITHDEBUGINFO -lineinfo)
+list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
 
 # Crucial magic for CUDA linking
 find_package(Threads REQUIRED)

diff --git a/README.md b/README.md
diff --git a/screenshots/chart_blocksize.png b/screenshots/chart_blocksize.png
diff --git a/screenshots/chart_compact.png b/screenshots/chart_compact.png
diff --git a/screenshots/chart_scan.png b/screenshots/chart_scan.png
diff --git a/screenshots/chart_scan_optimization.png b/screenshots/chart_scan_optimization.png
diff --git a/screenshots/chart_sort.png b/screenshots/chart_sort.png
diff --git a/screenshots/preview.gif b/screenshots/preview.gif
diff --git a/screenshots/preview_optimized.gif b/screenshots/preview_optimized.gif
diff --git a/src/main.cpp b/src/main.cpp
@@ -1,89 +1,131 @@
 /**
  * @file      main.cpp
  * @brief     Stream compaction test program
- * @authors   Kai Ninomiya
- * @date      2015
+ * @authors   Kai Ninomiya, Ruoyu Fan
+ * @date      2015, 2016
  * @copyright University of Pennsylvania
  */
 
-#include <cstdio>
 #include <stream_compaction/cpu.h>
 #include <stream_compaction/naive.h>
 #include <stream_compaction/efficient.h>
 #include <stream_compaction/thrust.h>
+#include <stream_compaction/radix_sort.h>
+
 #include "testing_helpers.hpp"
+#include <iterator>
+#include <algorithm>
+#include <iostream>
+
+// A size of 1 << 26 could run on a desktop 970,
+//   but it crashed on my laptop (970m), so I reduced the array size.
+// Reduce the array size further if there is still a problem.
+const int SIZE = 1 << 25; //constexpr 
+//const int SIZE = 1 << 24;
+const int NPOT = SIZE - 3;
+const int SCAN_MAX = 50;
+const int COMPACTION_MAX = 4;
+
+const int SORT_SIZE = 1 << 25; 
+//const int SORT_SIZE = 1 << 24;
+const int SORT_NPOT = SORT_SIZE - 3;
+const int SORT_MAX = 1000000000;
+
+
+int a[SIZE], b[SIZE], c[SIZE], d[SORT_SIZE], e[SORT_SIZE], f[SORT_SIZE];
 
 int main(int argc, char* argv[]) {
-    const int SIZE = 1 << 8;
-    const int NPOT = SIZE - 3;
-    int a[SIZE], b[SIZE], c[SIZE];
 
     // Scan tests
+
+    std::cout << "CIS-565 HW2 CUDA Stream Compaction Test (Ruoyu Fan)";
+    std::cout << std::endl;
+    std::cout << "    Block size for naive scan: " << StreamCompaction::Naive::getNaiveScanBlockSize() << std::endl;
+    std::cout << "    Block size for up-sweep: " << StreamCompaction::Efficient::getUpSweepBlockSize() << std::endl;
+    std::cout << "    Block size for down-sweep: " << StreamCompaction::Efficient::getDownSweepBlockSize() << std::endl;
+    std::cout << "    Block size for boolean mapping: " << StreamCompaction::Common::getMapToBooleanBlockSize() << std::endl;
+    std::cout << "    Block size for scattering: " << StreamCompaction::Common::getScatterBlocksize() << std::endl;
+    std::cout << "    Block sizes for radix sort: " 
+        << StreamCompaction::RadixSort::getComputeBArrayBlockSize() << " "
+        << StreamCompaction::RadixSort::getComputeDArrayBlockSize() << " "
+        << StreamCompaction::RadixSort::getComputeEArrayBlockSize() << " "
+        << StreamCompaction::RadixSort::getReshuffleBlockSize() << std::endl;
 
     printf("\n");
     printf("****************\n");
     printf("** SCAN TESTS **\n");
     printf("****************\n");
+    std::cout << "Array size (power of two): " << SIZE << std::endl;
+    std::cout << "Array size (non-power of two): " << NPOT << std::endl;
 
-    genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
+    genArray(SIZE - 1, a, SCAN_MAX);  // result for edge case of 0 looks fine
     a[SIZE - 1] = 0;
     printArray(SIZE, a, true);
 
     zeroArray(SIZE, b);
     printDesc("cpu scan, power-of-two");
     StreamCompaction::CPU::scan(SIZE, b, a);
+    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
     printArray(SIZE, b, true);
 
     zeroArray(SIZE, c);
     printDesc("cpu scan, non-power-of-two");
     StreamCompaction::CPU::scan(NPOT, c, a);
-    printArray(NPOT, b, true);
+    // Print c, the buffer this scan just filled; b still holds the
+    // power-of-two result and only serves as the reference for the comparison below.
+    printArray(NPOT, c, true);
+    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
+    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
-    //printArray(SIZE, c, true);
+    printArray(NPOT, c, true);
+    printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
+    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
     StreamCompaction::Efficient::scan(NPOT, c, a);
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
+    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, power-of-two");
     StreamCompaction::Thrust::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
+    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, non-power-of-two");
     StreamCompaction::Thrust::scan(NPOT, c, a);
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
+    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
     printCmpResult(NPOT, b, c);
 
     printf("\n");
     printf("*****************************\n");
     printf("** STREAM COMPACTION TESTS **\n");
     printf("*****************************\n");
-
+    std::cout << "Array size (power of two): " << SIZE << std::endl;
+    std::cout << "Array size (non-power of two): " << NPOT << std::endl;
     // Compaction tests
 
-    genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
+    genArray(SIZE - 1, a, COMPACTION_MAX);  // result for edge case of 0 looks fine
     a[SIZE - 1] = 0;
     printArray(SIZE, a, true);
 
@@ -94,30 +136,82 @@ int main(int argc, char* argv[]) {
     count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
     expectedCount = count;
     printArray(count, b, true);
+    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
     printCmpLenResult(count, expectedCount, b, b);
 
     zeroArray(SIZE, c);
     printDesc("cpu compact without scan, non-power-of-two");
     count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
     expectedNPOT = count;
     printArray(count, c, true);
+    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
     printCmpLenResult(count, expectedNPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("cpu compact with scan");
     count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
     printArray(count, c, true);
+    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, power-of-two");
     count = StreamCompaction::Efficient::compact(SIZE, c, a);
-    //printArray(count, c, true);
+    printArray(count, c, true);
+    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
-    //printArray(count, c, true);
+    printArray(count, c, true);
+    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
     printCmpLenResult(count, expectedNPOT, b, c);
+
+    printf("\n");
+    printf("*****************************\n");
+    printf("** RADIX SORT TESTS **\n");
+    printf("*****************************\n");
+    std::cout << "Array size (power of two): " << SORT_SIZE << std::endl;
+    std::cout << "Array size (non-power of two): " << SORT_NPOT << std::endl;
+    std::cout << "Max value: " << SORT_MAX << std::endl;
+
+    genArray(SORT_SIZE - 1, d, SORT_MAX);
+    d[SORT_SIZE - 1] = 0;
+    printArray(SORT_SIZE, d, true);
+
+    printDesc("std::sort, power-of-two");
+    std::copy(std::begin(d), std::end(d), std::begin(e));
+    StreamCompaction::CPU::stdSort(std::begin(e), std::end(e));
+    printArray(SORT_SIZE, e, true);
+    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+
+    printDesc("thrust::sort (which calls Thrust's radix sort), power-of-two");
+    std::copy(std::begin(d), std::end(d), std::begin(f));
+    StreamCompaction::Thrust::sort(std::begin(f), std::end(f));
+    printArray(SORT_SIZE, f, true);
+     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    printCmpResult(SORT_SIZE, e, f);
+	// I wanted to compare with thrust's unstable and stable sort, but it uses radix sort!
+
+    printDesc("radix sort, power-of-two");
+    std::copy(std::begin(d), std::end(d), std::begin(f));
+    StreamCompaction::RadixSort::radixSort(std::begin(f), std::end(f), SORT_MAX);
+    printArray(SORT_SIZE, f, true);
+    printElapsedTime(StreamCompaction::RadixSort::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    printCmpResult(SORT_SIZE, e, f);
+
+    // Must come after all power-of-two sort tests: this overwrites e, the reference result they compare against.
+    printDesc("std::sort, non power-of-two");
+    std::copy(std::begin(d), std::end(d), std::begin(e));
+    StreamCompaction::CPU::stdSort(std::begin(e), std::begin(e) + SORT_NPOT);
+    printArray(SORT_NPOT, e, true);
+    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+
+    printDesc("radix sort, non power-of-two");
+    std::copy(std::begin(d), std::end(d), std::begin(f));
+    StreamCompaction::RadixSort::radixSort(std::begin(f), std::begin(f) + SORT_NPOT, SORT_MAX);
+    printArray(SORT_NPOT, f, true);
+    printElapsedTime(StreamCompaction::RadixSort::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    printCmpResult(SORT_NPOT, e, f);
 }
diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp
@@ -1,6 +1,9 @@
 #pragma once
 
 #include <cstdlib>
+#include <cstdio>
+#include <ctime>
+#include <iostream>
+#include <string>
 
 template<typename T>
 int cmpArrays(int n, T *a, T *b) {
@@ -17,6 +20,12 @@ void printDesc(const char *desc) {
     printf("==== %s ====\n", desc);
 }
 
+/**
+ * Prints one timing line of the form "   elapsed time: <time>ms    <note>"
+ * to std::cout, followed by std::endl.
+ *
+ * @tparam T    any type that can be streamed to std::cout
+ * @param time  value printed in front of the "ms" suffix
+ * @param note  optional text appended after the measurement; taken by const
+ *              reference so the string is not copied on every call
+ */
+template<typename T>
+void printElapsedTime(T time, const std::string &note = "")
+{
+    std::cout << "   elapsed time: " << time << "ms    " << note << std::endl;
+}
+
 template<typename T>
 void printCmpResult(int n, T *a, T *b) {
     printf("    %s \n",
@@ -40,10 +49,19 @@ void zeroArray(int n, int *a) {
 }
 
 void genArray(int n, int *a, int maxval) {
-    srand(0);
+    //srand(0);
+    srand(time(nullptr));
 
-    for (int i = 0; i < n; i++) {
-        a[i] = rand() % maxval;
+    for (int i = 0; i < n; i++) 
+    {
+        if (maxval == 0)
+        {
+            a[i] = 0;
+        }
+        else
+        {
+            a[i] = rand() % maxval;
+        }
     }
 }
 

diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
@@ -9,6 +9,8 @@ set(SOURCE_FILES
     "efficient.cu"
     "thrust.h"
     "thrust.cu"
+    "radix_sort.h"
+    "radix_sort.cu"
     )
 
 cuda_add_library(stream_compaction

diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
@@ -1,5 +1,8 @@
 #include "common.h"
 
+#include <cuda.h>
+#include <cuda_runtime.h>
+
 void checkCUDAErrorFn(const char *msg, const char *file, int line) {
     cudaError_t err = cudaGetLastError();
     if (cudaSuccess == err) {
@@ -22,17 +25,61 @@ namespace Common {
  * Maps an array to an array of 0s and 1s for stream compaction. Elements
  * which map to 0 will be removed, and elements which map to 1 will be kept.
  */
-__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
-    // TODO
+__global__ void kernMapToBoolean(int n, int *bools, const int *idata)
+{
+    // Each thread handles the single element at its flattened 1D index;
+    // threads whose index is at or beyond n do nothing.
+    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid < n)
+    {
+        // Non-zero input -> 1 (keep), zero input -> 0 (remove).
+        bools[tid] = (idata[tid] != 0) ? 1 : 0;
+    }
 }
 
+
+//__global__ void kernScatter(int n, int *odata,
+//        const int *idata, const int *bools, const int *indices) 
+
 /**
- * Performs scatter on an array. That is, for each element in idata,
- * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
+ * Performs scatter on an array. That is, for each element in idata,
+ * if idata[idx] is non-zero, it copies idata[idx] to odata[indices[idx]].
+ *
+ * There is no separate bools parameter: the input value itself is the
+ * keep/discard test, which saves one buffer.
+ *
+ * @param n        number of elements in idata and indices
+ * @param odata    output array; written only at positions taken from indices
+ * @param idata    input array; zero-valued elements are skipped
+ * @param indices  destination index in odata for each input element
+ *                 (presumably the exclusive scan of the boolean map - confirm
+ *                 against the caller)
  */
 __global__ void kernScatter(int n, int *odata,
-        const int *idata, const int *bools, const int *indices) {
-    // TODO
+        const int *idata, const int *indices)
+{
+    auto index = threadIdx.x + blockIdx.x * blockDim.x;
+    if (index >= n) { return; }
+
+    // Read the element once; it is both the predicate and the payload.
+    const int value = idata[index];
+    if (value)
+    {
+        odata[indices[index]] = value;
+    }
+}
+
+/**
+ * Returns the block size to use when launching kernMapToBoolean.
+ * The value is obtained from calculateBlockSizeForDeviceFunction on the
+ * first call and reused on every later call.
+ */
+int getMapToBooleanBlockSize()
+{
+    // A function-local static is initialized exactly once, and since C++11
+    // that initialization is thread-safe, so no manual -1 sentinel is needed.
+    static const int block_size = calculateBlockSizeForDeviceFunction(kernMapToBoolean);
+    return block_size;
+}
+/**
+ * Returns the block size to use when launching kernScatter.
+ * The value is obtained from calculateBlockSizeForDeviceFunction on the
+ * first call and reused on every later call.
+ */
+int getScatterBlocksize()
+{
+    // A function-local static is initialized exactly once, and since C++11
+    // that initialization is thread-safe, so no manual -1 sentinel is needed.
+    static const int block_size = calculateBlockSizeForDeviceFunction(kernScatter);
+    return block_size;
 }
 
 }