Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
32be56f
CPU scan and compact
WindyDarian Sep 17, 2016
5e06ef2
explicit cast
WindyDarian Sep 17, 2016
71c9419
just a temperorary log
WindyDarian Sep 17, 2016
10961d8
Naive scan
WindyDarian Sep 17, 2016
2b92f22
work efficient scan
WindyDarian Sep 18, 2016
8f798f5
TODO->DONE
WindyDarian Sep 18, 2016
940fe19
typo
WindyDarian Sep 18, 2016
9852e9c
allow scan to be called with device array?
WindyDarian Sep 18, 2016
f9bf48f
no that idea was not necessary
WindyDarian Sep 18, 2016
eaeb2d2
WAIT DID YOU SAY EXCLUSIVE SCAN
WindyDarian Sep 18, 2016
0b16d3c
exclusive work-efficient scan (was inclusive)
WindyDarian Sep 18, 2016
8ddb117
GPU work-efficient compact
WindyDarian Sep 18, 2016
ea62ecb
todo->done
WindyDarian Sep 18, 2016
f7aa0c7
thrust scan
WindyDarian Sep 18, 2016
3c79532
framework and unit test for radix sort
WindyDarian Sep 18, 2016
ebabb79
radix sort finished
WindyDarian Sep 18, 2016
778322b
measurement helpers
WindyDarian Sep 18, 2016
447d877
brief readme
WindyDarian Sep 18, 2016
d233cbd
readme image and blocksize
WindyDarian Sep 18, 2016
55d2459
optimization: reduced amount of threads used for efficient sweep
WindyDarian Sep 19, 2016
b957b5a
finializing
WindyDarian Sep 19, 2016
d783eac
readme
WindyDarian Sep 19, 2016
3cbd491
more description
WindyDarian Sep 19, 2016
6262096
That should do?
WindyDarian Sep 19, 2016
26a55f6
additional test results
WindyDarian Sep 19, 2016
ada5d32
tables
WindyDarian Sep 19, 2016
ec5deec
tested on my laptop
WindyDarian Sep 19, 2016
0eb039c
fixed thrust::sort measurement
WindyDarian Sep 23, 2016
37ee8be
typo
WindyDarian Sep 23, 2016
0ac719d
beautifu
WindyDarian Sep 23, 2016
7a8b19f
tested build on ubuntu and set extended part of array to zero at radi…
WindyDarian Sep 24, 2016
73f315e
charts
WindyDarian Sep 27, 2016
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ set(CMAKE_CXX_STANDARD 11)

list(APPEND CUDA_NVCC_FLAGS_DEBUG -G -g)
list(APPEND CUDA_NVCC_FLAGS_RELWITHDEBUGINFO -lineinfo)
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")

# Crucial magic for CUDA linking
find_package(Threads REQUIRED)
Expand Down
467 changes: 462 additions & 5 deletions README.md

Large diffs are not rendered by default.

Binary file added screenshots/chart_blocksize.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added screenshots/chart_compact.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added screenshots/chart_scan.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added screenshots/chart_scan_optimization.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added screenshots/chart_sort.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added screenshots/preview.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added screenshots/preview_optimized.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
128 changes: 111 additions & 17 deletions src/main.cpp
Original file line number Diff line number Diff line change
@@ -1,89 +1,131 @@
/**
* @file main.cpp
* @brief Stream compaction test program
* @authors Kai Ninomiya
* @date 2015
* @authors Kai Ninomiya, Ruoyu Fan
* @date 2015, 2016
* @copyright University of Pennsylvania
*/

#include <cstdio>
#include <stream_compaction/cpu.h>
#include <stream_compaction/naive.h>
#include <stream_compaction/efficient.h>
#include <stream_compaction/thrust.h>
#include <stream_compaction/radix_sort.h>

#include "testing_helpers.hpp"
#include <iterator>
#include <algorithm>
#include <iostream>

// size of 1 << 26 could on a 970 desktop
// but crashed on my laptop (970m) so I reduced array size
// change the array size if there is still problem
const int SIZE = 1 << 25; //constexpr
//const int SIZE = 1 << 24;
const int NPOT = SIZE - 3;
const int SCAN_MAX = 50;
const int COMPACTION_MAX = 4;

const int SORT_SIZE = 1 << 25;
//const int SORT_SIZE = 1 << 24;
const int SORT_NPOT = SORT_SIZE - 3;
const int SORT_MAX = 1000000000;


int a[SIZE], b[SIZE], c[SIZE], d[SORT_SIZE], e[SORT_SIZE], f[SORT_SIZE];

int main(int argc, char* argv[]) {
const int SIZE = 1 << 8;
const int NPOT = SIZE - 3;
int a[SIZE], b[SIZE], c[SIZE];

// Scan tests

std::cout << "CIS-565 HW2 CUDA Stream Compaction Test (Ruoyu Fan)";
std::cout << std::endl;
std::cout << " Block size for naive scan: " << StreamCompaction::Naive::getNaiveScanBlockSize() << std::endl;
std::cout << " Block size for up-sweep: " << StreamCompaction::Efficient::getUpSweepBlockSize() << std::endl;
std::cout << " Block size for down-sweep: " << StreamCompaction::Efficient::getDownSweepBlockSize() << std::endl;
std::cout << " Block size for boolean mapping: " << StreamCompaction::Common::getMapToBooleanBlockSize() << std::endl;
std::cout << " Block size for scattering: " << StreamCompaction::Common::getScatterBlocksize() << std::endl;
std::cout << " Block sizes for radix sort: "
<< StreamCompaction::RadixSort::getComputeBArrayBlockSize() << " "
<< StreamCompaction::RadixSort::getComputeDArrayBlockSize() << " "
<< StreamCompaction::RadixSort::getComputeEArrayBlockSize() << " "
<< StreamCompaction::RadixSort::getReshuffleBlockSize() << std::endl;

printf("\n");
printf("****************\n");
printf("** SCAN TESTS **\n");
printf("****************\n");
std::cout << "Array size (power of two): " << SIZE << std::endl;
std::cout << "Array size (non-power of two): " << NPOT << std::endl;

genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case
genArray(SIZE - 1, a, SCAN_MAX); // result for edge case of 0 looks fine
a[SIZE - 1] = 0;
printArray(SIZE, a, true);

zeroArray(SIZE, b);
printDesc("cpu scan, power-of-two");
StreamCompaction::CPU::scan(SIZE, b, a);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
printArray(SIZE, b, true);

zeroArray(SIZE, c);
printDesc("cpu scan, non-power-of-two");
StreamCompaction::CPU::scan(NPOT, c, a);
printArray(NPOT, b, true);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("naive scan, power-of-two");
StreamCompaction::Naive::scan(SIZE, c, a);
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("naive scan, non-power-of-two");
StreamCompaction::Naive::scan(NPOT, c, a);
//printArray(SIZE, c, true);
printArray(NPOT, c, true);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient scan, power-of-two");
StreamCompaction::Efficient::scan(SIZE, c, a);
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient scan, non-power-of-two");
StreamCompaction::Efficient::scan(NPOT, c, a);
//printArray(NPOT, c, true);
printArray(NPOT, c, true);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("thrust scan, power-of-two");
StreamCompaction::Thrust::scan(SIZE, c, a);
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("thrust scan, non-power-of-two");
StreamCompaction::Thrust::scan(NPOT, c, a);
//printArray(NPOT, c, true);
printArray(NPOT, c, true);
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
printCmpResult(NPOT, b, c);

printf("\n");
printf("*****************************\n");
printf("** STREAM COMPACTION TESTS **\n");
printf("*****************************\n");

std::cout << "Array size (power of two): " << SIZE << std::endl;
std::cout << "Array size (non-power of two): " << NPOT << std::endl;
// Compaction tests

genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case
genArray(SIZE - 1, a, COMPACTION_MAX); // result for edge case of 0 looks fine
a[SIZE - 1] = 0;
printArray(SIZE, a, true);

Expand All @@ -94,30 +136,82 @@ int main(int argc, char* argv[]) {
count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
expectedCount = count;
printArray(count, b, true);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
printCmpLenResult(count, expectedCount, b, b);

zeroArray(SIZE, c);
printDesc("cpu compact without scan, non-power-of-two");
count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
expectedNPOT = count;
printArray(count, c, true);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
printCmpLenResult(count, expectedNPOT, b, c);

zeroArray(SIZE, c);
printDesc("cpu compact with scan");
count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
printArray(count, c, true);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
printCmpLenResult(count, expectedCount, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient compact, power-of-two");
count = StreamCompaction::Efficient::compact(SIZE, c, a);
//printArray(count, c, true);
printArray(count, c, true);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
printCmpLenResult(count, expectedCount, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient compact, non-power-of-two");
count = StreamCompaction::Efficient::compact(NPOT, c, a);
//printArray(count, c, true);
printArray(count, c, true);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
printCmpLenResult(count, expectedNPOT, b, c);

printf("\n");
printf("*****************************\n");
printf("** RADIX SORT TESTS **\n");
printf("*****************************\n");
std::cout << "Array size (power of two): " << SORT_SIZE << std::endl;
std::cout << "Array size (non-power of two): " << SORT_NPOT << std::endl;
std::cout << "Max value: " << SORT_MAX << std::endl;

genArray(SORT_SIZE - 1, d, SORT_MAX);
d[SORT_SIZE - 1] = 0;
printArray(SORT_SIZE, d, true);

printDesc("std::sort, power-of-two");
std::copy(std::begin(d), std::end(d), std::begin(e));
StreamCompaction::CPU::stdSort(std::begin(e), std::end(e));
printArray(SORT_SIZE, e, true);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");

printDesc("thrust::sort (which calls Thrust's radix sort), power-of-two");
std::copy(std::begin(d), std::end(d), std::begin(f));
StreamCompaction::Thrust::sort(std::begin(f), std::end(f));
printArray(SORT_SIZE, f, true);
printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
printCmpResult(SORT_SIZE, e, f);
// I wanted to compare with thrust's unstable and stable sort, but it uses radix sort!

printDesc("radix sort, power-of-two");
std::copy(std::begin(d), std::end(d), std::begin(f));
StreamCompaction::RadixSort::radixSort(std::begin(f), std::end(f), SORT_MAX);
printArray(SORT_SIZE, f, true);
printElapsedTime(StreamCompaction::RadixSort::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
printCmpResult(SORT_SIZE, e, f);

// must be after all power-of-two sorts since it is standard value
printDesc("std::sort, non power-of-two");
std::copy(std::begin(d), std::end(d), std::begin(e));
StreamCompaction::CPU::stdSort(std::begin(e), std::begin(e) + SORT_NPOT);
printArray(SORT_NPOT, e, true);
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");

printDesc("radix sort, non power-of-two");
std::copy(std::begin(d), std::end(d), std::begin(f));
StreamCompaction::RadixSort::radixSort(std::begin(f), std::begin(f) + SORT_NPOT, SORT_MAX);
printArray(SORT_NPOT, f, true);
printElapsedTime(StreamCompaction::RadixSort::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
printCmpResult(SORT_NPOT, e, f);
}
24 changes: 21 additions & 3 deletions src/testing_helpers.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#pragma once

#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <string>

template<typename T>
int cmpArrays(int n, T *a, T *b) {
Expand All @@ -17,6 +20,12 @@ void printDesc(const char *desc) {
printf("==== %s ====\n", desc);
}

template<typename T>
void printElapsedTime(T time, std::string note = "")
{
std::cout << " elapsed time: " << time << "ms " << note << std::endl;
}

template<typename T>
void printCmpResult(int n, T *a, T *b) {
printf(" %s \n",
Expand All @@ -40,10 +49,19 @@ void zeroArray(int n, int *a) {
}

void genArray(int n, int *a, int maxval) {
srand(0);
//srand(0);
srand(time(nullptr));

for (int i = 0; i < n; i++) {
a[i] = rand() % maxval;
for (int i = 0; i < n; i++)
{
if (maxval == 0)
{
a[i] = 0;
}
else
{
a[i] = rand() % maxval;
}
}
}

Expand Down
2 changes: 2 additions & 0 deletions stream_compaction/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ set(SOURCE_FILES
"efficient.cu"
"thrust.h"
"thrust.cu"
"radix_sort.h"
"radix_sort.cu"
)

cuda_add_library(stream_compaction
Expand Down
61 changes: 54 additions & 7 deletions stream_compaction/common.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#include "common.h"

#include <cuda.h>
#include <cuda_runtime.h>

void checkCUDAErrorFn(const char *msg, const char *file, int line) {
cudaError_t err = cudaGetLastError();
if (cudaSuccess == err) {
Expand All @@ -22,17 +25,61 @@ namespace Common {
* Maps an array to an array of 0s and 1s for stream compaction. Elements
* which map to 0 will be removed, and elements which map to 1 will be kept.
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
// TODO
__global__ void kernMapToBoolean(int n, int *bools, const int *idata)
{
auto index = threadIdx.x + blockIdx.x * blockDim.x;
if (index >= n) { return; }

if (idata[index])
{
bools[index] = 1;
}
else
{
bools[index] = 0;
}
}


//__global__ void kernScatter(int n, int *odata,
// const int *idata, const int *bools, const int *indices)

/**
* Performs scatter on an array. That is, for each element in idata,
* if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
*/
* Performs scatter on an array. That is, for each element in idata,
* if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
*/
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
// TODO
const int *idata, const int *indices)
{
// use one less buffer to save space
auto index = threadIdx.x + blockIdx.x * blockDim.x;
if (index >= n) { return; }

if (idata[index])
{
odata[indices[index]] = idata[index];
}
}

int getMapToBooleanBlockSize()
{
// not thread-safe
static int block_size = -1;
if (block_size == -1)
{
block_size = calculateBlockSizeForDeviceFunction(kernMapToBoolean);
}
return block_size;
}
int getScatterBlocksize()
{
// not thread-safe
static int block_size = -1;
if (block_size == -1)
{
block_size = calculateBlockSizeForDeviceFunction(kernScatter);
}
return block_size;
}

}
Expand Down
Loading