diff --git a/README.md b/README.md index b71c458..5938fd3 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,214 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Jian Ru +* Tested on: Windows 10, i7-4850 @ 2.30GHz 16GB, GT 750M 2GB (Personal) -### (TODO: Your README) +### List of Features -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +* Batch scan (faster than thrust implementation even when data size is as large as 2^26) +* CPU scan and stream compaction +* Naive GPU scan +* Work-efficient GPU scan and stream compaction (used shared memory) +* Thrust scan +* GPU radix sort (added radix_sort.h and radix_sort.cu) +* Custom tests +### Performance Analysis + + ![](img/sc_perf1.png) + +* Optimizing Block Sizes + * 128 or 256 turn out to be the best. Unless you use non-multiple of warp size, + really small (like 32), or really big (like 1024) block size, the performance difference + is not really that huge + * The only exception is work-efficient scan. Since reducing block size increases the number + of sub-sums and likely increases the number of recursions, it enjoys a larger block size + (512 turns out to be optimal on my machine). Even if so, using 1024 turns out to be slower. + It may be due to reduced concurrent execution, since all the threads in a block need to be + synchronized at 3 different places in the kernel. Increasing block size means fewer threads + can terminate and leave multiprocessor early. So it is likely to hurt performance. + +* Array Size vs. 
Execution Time (ms) + * Exclusive scan + + ![](img/scan_perf.png) + + | Array Size | Naive | Efficient | Thrust | CPU | Batch | + | ---------- | ----- | --------- | ------ | --- | ----- | + | 2^8 | 0.03 | 0.01 | 0.02 | 0.00 | 0.01 | + | 2^15 | 0.42 | 0.07 | 0.70 | 0.08 | 0.03 | + | 2^20 | 5.72 | 1.39 | 0.93 | 2.31 | 0.42 | + | 2^25 | 223.64 | 43.40 | 18.15 | 77.18 | 12.39 | + | 2^26 | 465.44 | 86.81 | 35.85 | 153.48 | 24.75 | + + * Stream compaction + + ![](img/sc_perf2.png) + + | Array Size | Efficient | CPU (no scan) | CPU (with scan) | Batch | + | --- | --- | --- | --- | --- | + | 2^8 | 0.09 | 0.00 | 0.00 | 0.11 | + | 2^15 | 0.16 | 0.09 | 0.20 | 0.15 | + | 2^20 | 3.78 | 2.70 | 6.03 | 1.61 | + | 2^25 | 72.91 | 85.98 | 196.08 | 42.37 | + | 2^26 | 143.49 | 174.08 | 391.90 | 81.53 | + + * Radix sort + + ![](img/rs_perf.png) + + | Array Size | Batch | Thrust | + | --- | --- | --- | + | 2^8 | 0.42 | 0.58 | + | 2^15 | 2.44 | 0.34 | + | 2^20 | 47.35 | 5.80 | + | 2^25 | 1468.98 | 116.92 | + +* Performance Bottlenecks + * CUDA runtime APIs for memory manipulation (e.g. cudaMalloc, cudaMemcpy, cudaFree) are super expensive + * Naive scan + * Excessive global memory access + * Too many kernel calls (each level of up/down-sweep requires one kernel call) + * Work-efficient scan + * Too many threads and inefficient memory I/O + * Threads that exceed the number of cores on the GPU will queue up. Too many threads also increase + management and scheduling overhead + * Each thread only reads in two 4-byte data and the two 4-byte values are read in using two separate + instructions. GPU, due to its SIMD nature, is better in handling vector I/O. That is why + batch scan loads and stores a vec4 in each thread + * Too many kernel calls when array size is large + * In my implementation, if array size exceeds 2 times block size, two more calls are + required: one scans the sub-sums and another one scatters scanned sub-sums to corresponding + blocks. 
If the size of sub-sum array is further greater than 2 times block size, another + two extra calls are generated and this recursive behavior will go on as array size gets + larger and larger. + * Thrust's scan implementation stabilizes at 3 kernel calls regardless of the size of data. + Thus even though my implementation is comparable or even slightly faster than thrust scan, + it becomes much slower when array size becomes really large (2^25 for example) + * Bank conflicts at the beginning but have been resolved now + * Algorithm not good enough (too much computation) when compared to thrust's implementation + because my scan kernel, on average, takes twice as much time to execute as thrust's kernel + * Thrust scan + * Additional cudaMalloc and cudaFree calls + * Judging from the performance analysis timeline, thrust is doing cudaMalloc and cudaFree + even if I pass in thrust::device_ptr. This causes thrust scan to become slower than my + work-efficient scan when array size is small + * Work-efficient stream compaction + * My implementation is basically a wrapper on work-efficient scan plus two light-weight + kernels, kernMapToBoolean and kernScatter. So the bottlenecks are the same as work efficient + scan + * Self-implemented radix sort + * Excessive kernel calls + * Scan kernel implementation not good enough when compared with thrust's + * My implementation is super naive. I basically separate elements into 0 bin and 1 bin and + do this for every bit. So if the type has 32 bits, there will be 3 * 32 kernel invocations + or more when array size gets large. + * On the other hand, judging from performance analysis timeline, thrust's implementation + has 3 * 7 kernel invocations all the time and one of the kernels is super cheap (took less + than 10 microseconds to execute on 2^25 array size). 
Moreover, even the two more expensive + kernels run much faster than my scan kernel on large data size + +* Sample Output + +``` + +GeForce GT 750M [sm_30] +Array Size: 1048576 +Test group size: 100 +Note: + 1. Execution time is the average over @TEST_GROUP_SIZE times exections + 2. Runtime API memory operations were excluded from time measurement + +**************** +** RADIX SORT TESTS ** +**************** + [ 3 4 3 2 0 2 0 0 0 2 3 0 2 ... 1 0 ] +==== parallel radix sort, power-of-two ==== + Execution Time: 109.91ms + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 4 4 ] + passed +==== parallel radix sort, non power-of-two ==== + Execution Time: 113.13ms + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 4 4 ] + passed +==== thrust sort, power-of-two ==== + Execution Time: 3.55ms + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 4 4 ] + passed +==== thrust sort, non power-of-two ==== + Execution Time: 3.41ms + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 4 4 ] + passed + +**************** +** SCAN TESTS ** +**************** +==== cpu scan, power-of-two ==== + Execution Time: 2.32ms + [ 0 3 7 10 12 12 14 14 14 14 16 19 19 ... 2098448 2098449 ] +==== cpu scan, non-power-of-two ==== + Execution Time: 2.39ms + [ 0 3 7 10 12 12 14 14 14 14 16 19 19 ... 2098443 2098447 ] + passed +==== naive scan, power-of-two ==== + Execution Time: 5.73ms + [ 0 3 7 10 12 12 14 14 14 14 16 19 19 ... 2098448 2098449 ] + passed +==== naive scan, non-power-of-two ==== + Execution Time: 5.72ms + [ 0 3 7 10 12 12 14 14 14 14 16 19 19 ... 2098443 2098447 ] + passed +==== work-efficient scan, power-of-two ==== + Execution Time: 1.39ms + [ 0 3 7 10 12 12 14 14 14 14 16 19 19 ... 2098448 2098449 ] + passed +==== work-efficient scan, non-power-of-two ==== + Execution Time: 1.39ms + [ 0 3 7 10 12 12 14 14 14 14 16 19 19 ... 2098443 2098447 ] + passed +==== batch scan, power-of-two ==== + Execution Time: 0.42ms + [ 0 3 7 10 12 12 14 14 14 14 16 19 19 ... 
2098448 2098449 ] + passed +==== batch scan, non-power-of-two ==== + Execution Time: 0.42ms + [ 0 3 7 10 12 12 14 14 14 14 16 19 19 ... 2098443 2098447 ] + passed +==== thrust scan, power-of-two ==== + Execution Time: 0.93ms + [ 0 3 7 10 12 12 14 14 14 14 16 19 19 ... 2098448 2098449 ] + passed +==== thrust scan, non-power-of-two ==== + Execution Time: 0.93ms + [ 0 3 7 10 12 12 14 14 14 14 16 19 19 ... 2098443 2098447 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + Execution Time: 2.97ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + Execution Time: 2.65ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +==== cpu compact with scan ==== + Execution Time: 5.90ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +==== work-efficient compact, power-of-two ==== + Execution Time: 2.57ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +==== work-efficient compact, non-power-of-two ==== + Execution Time: 2.57ms + [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ] + passed +Press any key to continue . . . 
+``` + + diff --git a/img/rs_perf.png b/img/rs_perf.png new file mode 100644 index 0000000..66e6753 Binary files /dev/null and b/img/rs_perf.png differ diff --git a/img/sc_perf1.png b/img/sc_perf1.png new file mode 100644 index 0000000..c128ba5 Binary files /dev/null and b/img/sc_perf1.png differ diff --git a/img/sc_perf2.png b/img/sc_perf2.png new file mode 100644 index 0000000..689d8e4 Binary files /dev/null and b/img/sc_perf2.png differ diff --git a/img/scan_perf.png b/img/scan_perf.png new file mode 100644 index 0000000..fa9a251 Binary files /dev/null and b/img/scan_perf.png differ diff --git a/src/main.cpp b/src/main.cpp index 675da35..38e555d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -6,74 +6,247 @@ * @copyright University of Pennsylvania */ + +#include +#include +#include #include +#define MEASURE_EXEC_TIME #include #include #include #include +#include #include "testing_helpers.hpp" + +#define TEST_GROUP_SIZE 100 + + int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; + const int SIZE = 1 << 20; const int NPOT = SIZE - 3; - int a[SIZE], b[SIZE], c[SIZE]; + int *a = new int[SIZE]; + int *b = new int[SIZE]; + int *c = new int[SIZE]; - // Scan tests + printGPUInfo(0); + std::cout << "Array Size: " << SIZE << '\n' + << "Test group size: " << TEST_GROUP_SIZE << '\n' + << "Note:\n" + << " 1. Execution time is the average over @TEST_GROUP_SIZE times exections\n" + << " 2. 
Runtime API memory operations were excluded from time measurement\n"; + //system("pause"); + // Radix sort test printf("\n"); printf("****************\n"); - printf("** SCAN TESTS **\n"); + printf("** RADIX SORT TESTS **\n"); printf("****************\n"); - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + genArray(SIZE - 1, a, 5); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; printArray(SIZE, a, true); + float accumExecTime = 0.f; + zeroArray(SIZE, b); + printDesc("parallel radix sort, power-of-two"); + try + { + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + accumExecTime += ParallelRadixSort::sort(SIZE, reinterpret_cast(b), reinterpret_cast(a), 0xffffffff); + if (i == 0) accumExecTime = 0.f; + } + } + catch (std::exception &e) + { + std::cout << " ParallelRadixSort::sort: Error: " << e.what() << std::endl; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(SIZE, b, true); + testSorted(SIZE, b); + + accumExecTime = 0.f; + zeroArray(SIZE, b); + printDesc("parallel radix sort, non power-of-two"); + try + { + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + accumExecTime += ParallelRadixSort::sort(NPOT, reinterpret_cast(b), reinterpret_cast(a), 0xffffffff); + if (i == 0) accumExecTime = 0.f; + } + } + catch (std::exception &e) + { + std::cout << " ParallelRadixSort::sort: Error: " << e.what() << std::endl; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(NPOT, b, true); + testSorted(NPOT, b); + + //system("pause"); + + accumExecTime = 0.f; + zeroArray(SIZE, b); + printDesc("thrust sort, power-of-two"); + try + { + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + accumExecTime += ParallelRadixSort::thrustSort(SIZE, reinterpret_cast(b), reinterpret_cast(a)); + if (i == 0) accumExecTime = 0.f; + } + } + catch (std::exception &e) + { + std::cout 
<< " ParallelRadixSort::thrustSort: Error: " << e.what() << std::endl; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(SIZE, b, true); + testSorted(SIZE, b); + + accumExecTime = 0.f; + zeroArray(SIZE, b); + printDesc("thrust sort, non power-of-two"); + try + { + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + accumExecTime += ParallelRadixSort::thrustSort(NPOT, reinterpret_cast(b), reinterpret_cast(a)); + if (i == 0) accumExecTime = 0.f; + } + } + catch (std::exception &e) + { + std::cout << " ParallelRadixSort::thrustSort: Error: " << e.what() << std::endl; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(NPOT, b, true); + testSorted(NPOT, b); + + // Scan tests + printf("\n"); + printf("****************\n"); + printf("** SCAN TESTS **\n"); + printf("****************\n"); + zeroArray(SIZE, b); printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); + clock_t start = clock(); + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + StreamCompaction::CPU::scan(SIZE, b, a); + clock_t end = clock(); + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << double(end - start) / CLOCKS_PER_SEC * 1000 / TEST_GROUP_SIZE << "ms\n"; printArray(SIZE, b, true); zeroArray(SIZE, c); printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); + start = clock(); + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + StreamCompaction::CPU::scan(NPOT, c, a); + end = clock(); + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << double(end - start) / CLOCKS_PER_SEC * 1000 / TEST_GROUP_SIZE << "ms\n"; printArray(NPOT, b, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - //printArray(SIZE, c, true); + accumExecTime = 0.f; + for (int i = 0; i < 
TEST_GROUP_SIZE; ++i) + { + accumExecTime += StreamCompaction::Naive::scan(SIZE, c, a); + if (i == 0) accumExecTime = 0.f; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - //printArray(SIZE, c, true); + accumExecTime = 0.f; + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + accumExecTime += StreamCompaction::Naive::scan(NPOT, c, a); + if (i == 0) accumExecTime = 0.f; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - //printArray(SIZE, c, true); + accumExecTime = 0.f; + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + accumExecTime += StreamCompaction::Efficient::scan(SIZE, c, a); + if (i == 0) accumExecTime = 0.f; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - //printArray(NPOT, c, true); + accumExecTime = 0.f; + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + accumExecTime += StreamCompaction::Efficient::scan(NPOT, c, a); + if (i == 0) accumExecTime = 0.f; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); + printDesc("batch scan, power-of-two"); + accumExecTime = 0.f; + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + accumExecTime += 
StreamCompaction::Efficient4::scan( + SIZE, reinterpret_cast(c), reinterpret_cast(a)); + if (i == 0) accumExecTime = 0.f; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("batch scan, non-power-of-two"); + accumExecTime = 0.f; + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + accumExecTime += StreamCompaction::Efficient4::scan( + NPOT, reinterpret_cast(c), reinterpret_cast(a)); + if (i == 0) accumExecTime = 0.f; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - //printArray(SIZE, c, true); + accumExecTime = 0.f; + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + accumExecTime += StreamCompaction::Thrust::scan(SIZE, c, a); + if (i == 0) accumExecTime = 0.f; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - //printArray(NPOT, c, true); + accumExecTime = 0.f; + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + accumExecTime += StreamCompaction::Thrust::scan(NPOT, c, a); + if (i == 0) accumExecTime = 0.f; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -82,6 +255,8 @@ int main(int argc, char* argv[]) { printf("*****************************\n"); // Compaction tests + //clock_t start; + //clock_t end; genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that 
edge case a[SIZE - 1] = 0; @@ -91,33 +266,103 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, b); printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + start = clock(); + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + } + end = clock(); + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << double(end - start) / CLOCKS_PER_SEC * 1000 / TEST_GROUP_SIZE << "ms\n"; expectedCount = count; printArray(count, b, true); printCmpLenResult(count, expectedCount, b, b); zeroArray(SIZE, c); printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + start = clock(); + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + } + end = clock(); + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << double(end - start) / CLOCKS_PER_SEC * 1000 / TEST_GROUP_SIZE << "ms\n"; expectedNPOT = count; printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); zeroArray(SIZE, c); printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + start = clock(); + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + } + end = clock(); + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << double(end - start) / CLOCKS_PER_SEC * 1000 / TEST_GROUP_SIZE << "ms\n"; printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); + float et; + accumExecTime = 0.f; + zeroArray(SIZE, c); + printDesc("work-efficient compact, power-of-two"); + for (int i = 0; i 
< TEST_GROUP_SIZE; ++i) + { + count = StreamCompaction::Efficient::compact(SIZE, c, a, &et); + if (i != 0) accumExecTime += et; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); + accumExecTime = 0.f; + zeroArray(SIZE, c); + printDesc("work-efficient compact, non-power-of-two"); + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + count = StreamCompaction::Efficient::compact(NPOT, c, a, &et); + if (i != 0) accumExecTime += et; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + + accumExecTime = 0.f; + zeroArray(SIZE, c); + printDesc("batch compact, power-of-two"); + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + count = StreamCompaction::Efficient4::compact( + SIZE, + reinterpret_cast(c), + reinterpret_cast(a), + &et); + if (i != 0) accumExecTime += et; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + accumExecTime = 0.f; + zeroArray(SIZE, c); + printDesc("batch compact, non-power-of-two"); + for (int i = 0; i < TEST_GROUP_SIZE; ++i) + { + count = StreamCompaction::Efficient4::compact( + NPOT, + reinterpret_cast(c), + reinterpret_cast(a), + &et); + if (i != 0) accumExecTime += et; + } + std::cout << " Execution Time: " << std::fixed << std::setprecision(2) << accumExecTime / (TEST_GROUP_SIZE - 1) << "ms\n"; + printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, 
c); + + delete[] a; + delete[] b; + delete[] c; + + //system("pause"); + return 0; } diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index f6b572f..a7e8738 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -1,6 +1,9 @@ #pragma once #include +#include +#include + template int cmpArrays(int n, T *a, T *b) { @@ -59,3 +62,34 @@ void printArray(int n, int *a, bool abridged = false) { printf("]\n"); } +template +void testSorted(int n, const T *data, bool lessThan = true) +{ + if (n <= 0 || !data) + { + printf(" testSorted: Error: Invalid argument(s)\n"); + return; + } + + for (int i = 0; i < n - 1; ++i) + { + if ((lessThan && (data[i] > data[i + 1])) || + (!lessThan && (data[i] < data[i + 1]))) + { + printf(" Failed: Element %d %s Element %d\n", i, lessThan ? ">" : "<", i + 1); + return; + } + } + + printf(" passed\n"); +} + +void printGPUInfo(int device) +{ + cudaDeviceProp deviceProp; + + cudaGetDeviceProperties(&deviceProp, device); + + std::cout << '\n' + << deviceProp.name << " [sm_" << deviceProp.major << deviceProp.minor << "]\n"; +} diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..fcda682 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -9,9 +9,11 @@ set(SOURCE_FILES "efficient.cu" "thrust.h" "thrust.cu" + "radix_sort.h" + "radix_sort.cu" ) cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_30 ) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index fe872d4..eb0acc0 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,5 +1,8 @@ +#include +#include #include "common.h" + void checkCUDAErrorFn(const char *msg, const char *file, int line) { cudaError_t err = cudaGetLastError(); if (cudaSuccess == err) { @@ -22,8 +25,13 @@ namespace Common { * Maps an array to an array of 0s and 1s for stream compaction. 
Elements * which map to 0 will be removed, and elements which map to 1 will be kept. */ -__global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO +__global__ void kernMapToBoolean(int n, int *bools, const int *idata) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid >= n) return; + + bools[tid] = static_cast(idata[tid] != 0); } /** @@ -31,8 +39,17 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. */ __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { - // TODO + const int *idata, const int *bools, const int *indices) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid >= n) return; + + if (bools[tid]) + { + int idx = indices[tid]; + odata[idx] = idata[tid]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 4f52663..3e32071 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -3,6 +3,7 @@ #include #include #include +#include #define FILENAME (strrchr(__FILE__, '/') ? 
strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) @@ -31,5 +32,65 @@ namespace Common { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices); + + class MyCudaTimer + { + public: + MyCudaTimer() : elapsedTimeMs(0.f), isActive(false) + { + cudaEventCreate(&begin); + cudaEventCreate(&end); + } + + virtual ~MyCudaTimer() + { + cudaEventDestroy(begin); + cudaEventDestroy(end); + } + + void start() + { + if (!isActive) + { + cudaEventRecord(begin); + isActive = true; + } + } + + void stop() + { + if (isActive) cudaEventRecord(end); + isActive = false; + } + + void restart() + { + stop(); + clear(); + start(); + } + + void clear() + { + cudaEventSynchronize(end); + elapsedTimeMs = 0.f; + } + + float duration() + { + float et; + + cudaEventSynchronize(end); + cudaEventElapsedTime(&et, begin, end); + elapsedTimeMs += et; + + return elapsedTimeMs; + } + + private: + cudaEvent_t begin, end; + float elapsedTimeMs; + bool isActive; + }; } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..d4ef883 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -8,8 +8,28 @@ namespace CPU { * CPU scan (prefix sum). */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + if (n <= 0 || !odata || !idata) + { + return; + } + + int pre, cur; + + for (int i = 0; i < n; ++i) + { + cur = idata[i]; + + if (i == 0) + { + odata[i] = 0; + } + else + { + odata[i] = pre + odata[i - 1]; + } + + pre = cur; + } } /** @@ -18,8 +38,22 @@ void scan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. 
*/ int compactWithoutScan(int n, int *odata, const int *idata) { - // TODO - return -1; + if (n <= 0 || !odata || !idata) + { + return -1; + } + + const int *obegin = odata; + + for (int i = 0; i < n; ++i) + { + if (idata[i]) + { + *odata++ = idata[i]; + } + } + + return static_cast(odata - obegin); } /** @@ -28,8 +62,40 @@ int compactWithoutScan(int n, int *odata, const int *idata) { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - // TODO - return -1; + if (n <= 0 || !odata || !idata || odata == idata) + { + return -1; + } + + if (n == 1 && idata[0]) + { + odata[0] = idata[0]; + return 1; + } + + for (int i = 0; i < n; ++i) + { + odata[i] = static_cast(idata[i] != 0); + } + + scan(n, odata, odata); + + int odataSize = odata[n - 1]; + + for (int i = 0; i < n - 1; ++i) + { + if (odata[i] != odata[i + 1]) + { + odata[odata[i]] = idata[i]; + } + } + + if (idata[n - 1]) + { + odata[odataSize] = idata[n - 1]; + ++odataSize; + } + return odataSize; } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..c67abe3 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -1,34 +1,674 @@ #include #include #include "common.h" +#define MEASURE_EXEC_TIME #include "efficient.h" +#include +#include + namespace StreamCompaction { -namespace Efficient { + namespace Efficient { -// TODO: __global__ +#ifdef USING_SHARED_MEMORY + __global__ void kernScan(int segSize, int * __restrict__ blockSums, int * __restrict__ odata, const int * __restrict__ idata) + { + extern __shared__ int temp[]; -/** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
- */ -void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); -} + const int base = blockIdx.x * segSize; + const int tid = threadIdx.x; + const int i1 = 2 * tid + 1; + const int i2 = 2 * tid + 2; + int offset = 1; + int ai, bi; -/** - * Performs stream compaction on idata, storing the result into odata. - * All zeroes are discarded. - * - * @param n The number of elements in idata. - * @param odata The array into which to store elements. - * @param idata The array of elements to compact. - * @returns The number of elements remaining after compaction. - */ -int compact(int n, int *odata, const int *idata) { - // TODO - return -1; -} + // cache data + int gidx1 = base + tid; + int gidx2 = gidx1 + blockDim.x; + int lidx1 = tid + CONFLICT_FREE_OFFSET(tid); + int lidx2 = tid + (segSize >> 1) + CONFLICT_FREE_OFFSET(tid + (segSize >> 1)); + temp[lidx1] = idata[gidx1]; + temp[lidx2] = idata[gidx2]; -} + // up sweep + for (int d = segSize >> 1; d > 0; d >>= 1) + { + __syncthreads(); + + if (tid < d) + { + ai = offset * i1 - 1; + bi = offset * i2 - 1; + ai += CONFLICT_FREE_OFFSET(ai); + bi += CONFLICT_FREE_OFFSET(bi); + + temp[bi] += temp[ai]; + } + + offset *= 2; + } + + if (tid == 0) + { + int idx = segSize - 1 + CONFLICT_FREE_OFFSET(segSize - 1); + if (blockSums) blockSums[blockIdx.x] = temp[idx]; + temp[idx] = 0; + } + + // down sweep + for (int d = 1; d < segSize; d *= 2) + { + offset >>= 1; + __syncthreads(); + + if (tid < d) + { + ai = offset * i1 - 1; + bi = offset * i2 - 1; + ai += CONFLICT_FREE_OFFSET(ai); + bi += CONFLICT_FREE_OFFSET(bi); + + int t = temp[ai]; + temp[ai] = temp[bi]; + temp[bi] += t; + } + } + + __syncthreads(); + + odata[gidx1] = temp[lidx1]; + odata[gidx2] = temp[lidx2]; + } + + __global__ void kernPerSegmentAdd(int segSize, int * __restrict__ odata, const int * __restrict__ blockSums) + { + int bid = blockIdx.x; + int tid = threadIdx.x; + int writeIdx1 = bid * segSize + 2 * tid; + int writeIdx2 = writeIdx1 + 1; + + int 
sum = blockSums[bid]; + odata[writeIdx1] += sum; + odata[writeIdx2] += sum; + } + + void scanHelper(int segSize, int n, int *odata_dev, const int *idata_dev) + { + // determine segment size + int threadsPerBlock = segSize >> 1; + int numBlocks = NUM_SEG(n, segSize); // also numSegs + + int *iblockSums = 0, *oblockSums = 0; + int segSizeNextLevel; + if (numBlocks > 1) + { + segSizeNextLevel = computeSegmentSize(numBlocks); + size_t offsetInDW = alignedSize(numBlocks * segSize * sizeof(int), 256) >> 2; + iblockSums = const_cast(idata_dev + offsetInDW); + oblockSums = odata_dev + offsetInDW; + } + + kernScan << > >(segSize, iblockSums, odata_dev, idata_dev); + + if (numBlocks > 1) + { + scanHelper(segSizeNextLevel, numBlocks, oblockSums, iblockSums); + kernPerSegmentAdd << > >(segSize, odata_dev, oblockSums); + } + } +#else + __global__ void kernScanUpSweepOneLevel(int offset, int numActiveThreads, int *iodata) + { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid >= numActiveThreads) + { + return; + } + + if (numActiveThreads == 1) // last level + { + iodata[2 * offset - 1] = 0; + return; + } + + int i1 = 2 * tid + 1; + int i2 = i1 + 1; + int ai, bi; + + ai = offset * i1 - 1; + bi = offset * i2 - 1; + iodata[bi] += iodata[ai]; + } + + __global__ void kernScanDownSweepOneLevel(int offset, int numActiveThreads, int *iodata) + { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid >= numActiveThreads) + { + return; + } + + int i1 = 2 * tid + 1; + int i2 = i1 + 1; + int ai, bi; + + ai = offset * i1 - 1; + bi = offset * i2 - 1; + int t = iodata[ai]; + iodata[ai] = iodata[bi]; + iodata[bi] += t; + } +#endif + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
+ */ +#ifdef MEASURE_EXEC_TIME + float scan(int n, int *odata, const int *idata) + { + if (n <= 0 || !odata || !idata || odata == idata) + { + return -1; + } +#else + void scan(int n, int *odata, const int *idata) + { + if (n <= 0 || !odata || !idata || odata == idata) + { + return; + } +#endif +#ifdef USING_SHARED_MEMORY + int segSize = computeSegmentSize(n); + const size_t kDevArraySizeInByte = computeActualMemSize(n); + int *odata_dev = 0; + int *idata_dev = 0; + + cudaMalloc(&odata_dev, kDevArraySizeInByte); + cudaMalloc(&idata_dev, kDevArraySizeInByte); + cudaMemset(idata_dev, 0, kDevArraySizeInByte); + cudaMemcpy(idata_dev, idata, n * sizeof(int), cudaMemcpyHostToDevice); + +#ifdef MEASURE_EXEC_TIME + float execTime = 0.f; + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); +#endif + + scanHelper(segSize, n, odata_dev, idata_dev); + +#ifdef MEASURE_EXEC_TIME + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&execTime, start, stop); + cudaEventDestroy(start); + cudaEventDestroy(stop); +#endif + + cudaMemcpy(odata, odata_dev, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(odata_dev); + cudaFree(idata_dev); + cudaDeviceSynchronize(); + +#ifdef MEASURE_EXEC_TIME + return execTime; +#endif +#else + const int paddedSize = nearestMultipleOfTwo(n); + const size_t kDevArraySizeInByte = paddedSize * sizeof(int); + int *iodata_dev = 0; + + cudaMalloc(&iodata_dev, kDevArraySizeInByte); + cudaMemset(iodata_dev, 0, kDevArraySizeInByte); + cudaMemcpy(iodata_dev, idata, n * sizeof(int), cudaMemcpyHostToDevice); + +#ifdef MEASURE_EXEC_TIME + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); +#endif + + const int threadsPerBlock = 256; + const int numLevels = ilog2ceil(n); + int numActiveThreads = paddedSize >> 1; + int offset = 1; + + // up sweep + for (int i = 0; i < numLevels; ++i) + { + int numBlocks = (numActiveThreads + 
threadsPerBlock - 1) / threadsPerBlock; + kernScanUpSweepOneLevel << > >(offset, numActiveThreads, iodata_dev); + numActiveThreads >>= 1; + offset *= 2; + } + + // down sweep + numActiveThreads = 1; + for (int i = 0; i < numLevels; ++i) + { + offset >>= 1; + int numBlocks = (numActiveThreads + threadsPerBlock - 1) / threadsPerBlock; + kernScanDownSweepOneLevel << > >(offset, numActiveThreads, iodata_dev); + numActiveThreads <<= 1; + } + +#ifdef MEASURE_EXEC_TIME + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float millisceconds = 0; + cudaEventElapsedTime(&millisceconds, start, stop); + cudaEventDestroy(start); + cudaEventDestroy(stop); +#endif + + cudaMemcpy(odata, iodata_dev, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(iodata_dev); + cudaDeviceSynchronize(); + +#ifdef MEASURE_EXEC_TIME + return millisceconds; +#endif +#endif + } + + /** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. 
+ */ +#ifdef MEASURE_EXEC_TIME + int compact(int n, int *odata, const int *idata, float *pExecTime) +#else + int compact(int n, int *odata, const int *idata) +#endif + { + if (n <= 0 || !odata || !idata || odata == idata) + { + return -1; + } + + using StreamCompaction::Common::kernMapToBoolean; + using StreamCompaction::Common::kernScatter; + +#ifdef MEASURE_EXEC_TIME + float &execTime = *pExecTime; + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); +#endif + + int *idata_dev = 0; + int *odata_dev = 0; + int *bools_dev = 0; + int *indices_dev = 0; + + int segSize = computeSegmentSize(n); + const size_t kBoolsSizeInByte = computeActualMemSize(n); + const size_t kIndicesSizeInByte = kBoolsSizeInByte; + + cudaMalloc(&idata_dev, n * sizeof(int)); + cudaMalloc(&bools_dev, kBoolsSizeInByte); + cudaMalloc(&indices_dev, kIndicesSizeInByte); + + cudaMemcpy(idata_dev, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemset(bools_dev, 0, kBoolsSizeInByte); + + const int threadsPerBlock = 256; + int numBlocks = (n + threadsPerBlock - 1) / threadsPerBlock; + +#ifdef MEASURE_EXEC_TIME + cudaEventRecord(start); + + kernMapToBoolean << > >(n, bools_dev, idata_dev); + + scanHelper(segSize, n, indices_dev, bools_dev); + + int numElemRemained; + cudaMemcpy(&numElemRemained, indices_dev + (n - 1), sizeof(int), cudaMemcpyDeviceToHost); + numElemRemained += idata[n - 1] ? 1 : 0; + cudaMalloc(&odata_dev, numElemRemained * sizeof(int)); + + kernScatter<<>>(n, odata_dev, idata_dev, bools_dev, indices_dev); + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&execTime, start, stop); +#else + kernMapToBoolean << > >(n, bools_dev, idata_dev); + + scanHelper(segSize, n, indices_dev, bools_dev); + + int numElemRemained; + cudaMemcpy(&numElemRemained, indices_dev + (n - 1), sizeof(int), cudaMemcpyDeviceToHost); + numElemRemained += idata[n - 1] ? 
1 : 0; + cudaMalloc(&odata_dev, numElemRemained * sizeof(int)); + + kernScatter << > >(n, odata_dev, idata_dev, bools_dev, indices_dev); +#endif + + cudaMemcpy(odata, odata_dev, numElemRemained * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(idata_dev); + cudaFree(odata_dev); + cudaFree(bools_dev); + cudaFree(indices_dev); + + return numElemRemained; + } + + } + + namespace Efficient4 + { + // Naive scan + __device__ unsigned scan1Inclusive(unsigned idata, volatile unsigned *s_data) + { + int pos = threadIdx.x; + int size = blockDim.x; + + // Pad 0's before to avoid pos >= offset branching + s_data[pos] = 0; + pos += size; + s_data[pos] = idata; + + for (int offset = 1; offset < size; offset <<= 1) + { + __syncthreads(); + unsigned t = s_data[pos - offset]; + __syncthreads(); + s_data[pos] += t; + } + + return s_data[pos]; + } + + __device__ unsigned scan1Exclusive(unsigned idata, volatile unsigned *s_data) + { + return scan1Inclusive(idata, s_data) - idata; + } + + __device__ uint4 scan4Inclusive(uint4 idata4, volatile unsigned *s_data) + { + // Perform inclusive prefix sum on the four elements + idata4.y += idata4.x; + idata4.z += idata4.y; + idata4.w += idata4.z; + + // Exclusively scan the 4-element sums on each block + unsigned oval = scan1Exclusive(idata4.w, s_data); + + // Accumulate results + idata4.x += oval; + idata4.y += oval; + idata4.z += oval; + idata4.w += oval; + + return idata4; + } + + __device__ uint4 scan4Exclusive(uint4 idata4, volatile unsigned *s_data) + { + uint4 odata4 = scan4Inclusive(idata4, s_data); + + // Make exclusive + odata4.x -= idata4.x; + odata4.y -= idata4.y; + odata4.z -= idata4.z; + odata4.w -= idata4.w; + + return odata4; + } + + /** + * Perform exclusive prefix sum on each thread block + * @n - size of d_dst and d_src + * @d_dst - destination device buffer + * @d_src - source device buffer + */ + __global__ void scanExclusiveShared(int n, uint4 *d_dst, uint4 *d_src) + { + extern __shared__ unsigned s_data[]; // size = 2 * 
blockDim.x + + int tid = blockDim.x * blockIdx.x + threadIdx.x; + + uint4 idata4; + if (tid < n) idata4 = d_src[tid]; + uint4 odata4 = scan4Exclusive(idata4, s_data); + if (tid < n) d_dst[tid] = odata4; + } + + /* + * @n - valid size of d_buf (not physical size) + * @preSegSize - segment size used in previous level scan + */ + __global__ void scanExclusiveShared2(int n, int preSegSize, + unsigned *d_obuf, unsigned *d_ibuf, unsigned *d_dst, unsigned *d_src) + { + extern __shared__ unsigned s_data[]; // size = 2 * blockDim.x + + int tid = blockDim.x * blockIdx.x + threadIdx.x; + + unsigned idata; + if (tid < n) + { + idata = d_dst[preSegSize - 1 + preSegSize * tid] + + d_src[preSegSize - 1 + preSegSize * tid]; + d_ibuf[tid] = idata; // so that result can be made inclusive again + } + + // Scan the sum of each block from last level scan + unsigned odata = scan1Exclusive(idata, s_data); + + if (tid < n) d_obuf[tid] = odata; + } + + /* + * @d_dst - size = blockDim.x * numBlocks + * @d_buf - size = numBlocks + */ + __global__ void perSegmentAdd(uint4 *d_dst, unsigned *d_buf) + { + __shared__ unsigned buf; + + int tid = blockDim.x * blockIdx.x + threadIdx.x; + + if (threadIdx.x == 0) buf = d_buf[blockIdx.x]; + + __syncthreads(); + + uint4 data4 = d_dst[tid]; + data4.x += buf; + data4.y += buf; + data4.z += buf; + data4.w += buf; + d_dst[tid] = data4; + } + + int computeSegmentSize(int n) + { + using Efficient::nearestMultipleOfTwo; + return n > (MAX_SEGMENT_SIZE >> 1) ? 
MAX_SEGMENT_SIZE : + std::max(nearestMultipleOfTwo(n), 4); + } + + size_t computeActualMemSize(int n) + { + using Efficient::alignedSize; + + const size_t kMemAlignmentInBytes = 256; // the alignment CUDA driver used + + int segSize = computeSegmentSize(n); + int numSegs = NUM_SEG(n, segSize); + size_t total = alignedSize(numSegs * segSize * sizeof(unsigned), kMemAlignmentInBytes); + + // Extra size to store intermediate results + while (numSegs > 1) + { + segSize = std::min(256, computeSegmentSize(numSegs)); + numSegs = NUM_SEG(numSegs, segSize); + size_t extra = alignedSize(numSegs * segSize * sizeof(unsigned), kMemAlignmentInBytes); + total += extra; + } + + return total; + } + + void scanHelper(int n, unsigned *d_odata, unsigned *d_idata) + { + using Efficient::alignedSize; + + int segSize = computeSegmentSize(n); + int numSegs = NUM_SEG(n, segSize); + int threadsPerBlock = segSize >> 2; + + scanExclusiveShared<<>>( + ROUND_SEG_SIZE(n, 4) >> 2, + reinterpret_cast(d_odata), + reinterpret_cast(d_idata)); + + std::vector st; + unsigned *d_dst = d_odata; + unsigned *d_src = d_idata; + + while (numSegs > 1) + { + st.push_back(numSegs); + st.push_back(segSize); + + size_t offsetInDW = alignedSize(numSegs * segSize * sizeof(unsigned), 256) / 4; + unsigned *d_ibuf = d_src + offsetInDW; + unsigned *d_obuf = d_dst + offsetInDW; + int n = numSegs; + int preSegSize = segSize; + + threadsPerBlock = segSize = std::min(256, computeSegmentSize(numSegs)); + numSegs = NUM_SEG(numSegs, segSize); + + scanExclusiveShared2<<>>( + n, preSegSize, d_obuf, d_ibuf, d_dst, d_src); + + d_src = d_ibuf; + d_dst = d_obuf; + } + + while (!st.empty()) + { + segSize = st.back(); + st.pop_back(); + numSegs = st.back(); + st.pop_back(); + + size_t offsetInDW = alignedSize(numSegs * segSize * sizeof(unsigned), 256) / 4; + unsigned *d_buf = d_dst; + d_dst -= offsetInDW; + + perSegmentAdd<<> 2)>>>(reinterpret_cast(d_dst), d_buf); + } + } + + float scan(int n, unsigned *odata, const unsigned *idata) + 
{ + if (n <= 0 || !odata || !idata) + { + throw std::exception("StreamCompaction::Efficient4::scan"); + } + + Common::MyCudaTimer timer; + + size_t kDevBuffSize = computeActualMemSize(n); + unsigned *d_idata, *d_odata; + + cudaMalloc(&d_idata, kDevBuffSize); + cudaMalloc(&d_odata, kDevBuffSize); + cudaMemset(d_idata, 0, kDevBuffSize); + //cudaMemset(d_odata, 0, kDevBuffSize); + cudaMemcpy(d_idata, idata, n * sizeof(unsigned), cudaMemcpyHostToDevice); + + timer.start(); + + scanHelper(n, d_odata, d_idata); + + timer.stop(); + + cudaMemcpy(odata, d_odata, n * sizeof(unsigned), cudaMemcpyDeviceToHost); + cudaFree(d_idata); + cudaFree(d_odata); + + return timer.duration(); + } + + + int compact(int n, unsigned *odata, const unsigned *idata, float *pExecTime) + { + if (n <= 0 || !odata || !idata || odata == idata) + { + throw std::exception("StreamCompaction::Efficient4::compact"); + } + + using StreamCompaction::Common::kernMapToBoolean; + using StreamCompaction::Common::kernScatter; + + Common::MyCudaTimer timer; + + unsigned *idata_dev = 0; + unsigned *odata_dev = 0; + unsigned *bools_dev = 0; + unsigned *indices_dev = 0; + + int segSize = computeSegmentSize(n); + const size_t kBoolsSizeInByte = computeActualMemSize(n); + const size_t kIndicesSizeInByte = kBoolsSizeInByte; + + cudaMalloc(&idata_dev, n * sizeof(unsigned)); + cudaMalloc(&bools_dev, kBoolsSizeInByte); + cudaMalloc(&indices_dev, kIndicesSizeInByte); + + cudaMemcpy(idata_dev, idata, n * sizeof(unsigned), cudaMemcpyHostToDevice); + cudaMemset(bools_dev, 0, kBoolsSizeInByte); + + const int threadsPerBlock = 256; + int numBlocks = (n + threadsPerBlock - 1) / threadsPerBlock; + + timer.start(); + + kernMapToBoolean<<>>( + n, + reinterpret_cast(bools_dev), + reinterpret_cast(idata_dev)); + + scanHelper(n, indices_dev, bools_dev); + + int numElemRemained; + cudaMemcpy(&numElemRemained, indices_dev + (n - 1), sizeof(unsigned), cudaMemcpyDeviceToHost); + numElemRemained += idata[n - 1] ? 
1 : 0; + cudaMalloc(&odata_dev, numElemRemained * sizeof(unsigned)); + + kernScatter<<>>( + n, + reinterpret_cast(odata_dev), + reinterpret_cast(idata_dev), + reinterpret_cast(bools_dev), + reinterpret_cast(indices_dev)); + + timer.stop(); + *pExecTime = timer.duration(); + + cudaMemcpy(odata, odata_dev, numElemRemained * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(idata_dev); + cudaFree(odata_dev); + cudaFree(bools_dev); + cudaFree(indices_dev); + + return numElemRemained; + } + } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 395ba10..6de9abf 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -1,9 +1,92 @@ #pragma once -namespace StreamCompaction { -namespace Efficient { - void scan(int n, int *odata, const int *idata); +#define USING_SHARED_MEMORY - int compact(int n, int *odata, const int *idata); -} +#ifdef USING_SHARED_MEMORY +#define MAX_SEGMENT_SIZE 1024 +#define NUM_SEG(x, ss) (((x) + (ss) - 1) / (ss)) +#define ROUND_SEG_SIZE(x, ss) (NUM_SEG(x, (ss)) * (ss)) + +#define NUM_BANKS 32 +#define LOG_NUM_BANKS 5 +#define CONFLICT_FREE_OFFSET(x) ((x) >> LOG_NUM_BANKS) +#endif + +namespace StreamCompaction +{ + namespace Efficient + { +#ifdef MEASURE_EXEC_TIME + float scan(int n, int *odata, const int *idata); + + int compact(int n, int *odata, const int *idata, float *pExecTime); +#else + void scan(int n, int *odata, const int *idata); + + int compact(int n, int *odata, const int *idata); +#endif + + void scanHelper(int segSize, int n, int *odata_dev, const int *idata_dev); + + inline int nearestMultipleOfTwo(int n) + { + int result = 1; + while (result < n) result <<= 1; + return result; + } + + inline int computeSegmentSize(int n) + { + return n > (MAX_SEGMENT_SIZE >> 1) ? 
MAX_SEGMENT_SIZE : nearestMultipleOfTwo(n); + } + + inline size_t alignedSize(size_t sizeInBytes, size_t alignmentInBytes) + { + return (sizeInBytes + alignmentInBytes - 1) / alignmentInBytes * alignmentInBytes; + } + + /** + * When data size is large, multiple passes are necessary to compute the + * final result and extra device memory is required to store intermediate + * results. + * + * Use the result from this function to cudaMalloc the input and output + * buffers passed to scanHelper which assumes that I/O device buffers have + * sufficient size. + * + * Assuming address start at a 256-byte boundary + */ + template + inline size_t computeActualMemSize(int n) + { + const size_t kMemAlignmentInBytes = 256; // the alignment CUDA driver used + + int segSize = computeSegmentSize(n); + int numSegs = NUM_SEG(n, segSize); + size_t total = alignedSize(numSegs * segSize * sizeof(T), kMemAlignmentInBytes); + + while (numSegs > 1) + { + segSize = computeSegmentSize(numSegs); + numSegs = NUM_SEG(numSegs, segSize); + size_t extra = alignedSize(numSegs * segSize * sizeof(T), kMemAlignmentInBytes); + total += extra; + } + + return total; + } + } + + namespace Efficient4 + { + float scan(int n, unsigned *odata, const unsigned *idata); + + int compact(int n, unsigned *odata, const unsigned *idata, float *pExecTime); + + int computeSegmentSize(int n); + + size_t computeActualMemSize(int n); + + void scanHelper(int n, unsigned *d_odata, unsigned *d_idata); + } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..5c301ff 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,20 +1,103 @@ #include #include #include "common.h" +#define MEASURE_EXEC_TIME #include "naive.h" namespace StreamCompaction { -namespace Naive { + namespace Naive { -// TODO: __global__ + __global__ void kernScanOneLevel(int stride, int n, int * __restrict__ odata, const int * __restrict__ idata) + { + unsigned tid = blockIdx.x * blockDim.x + 
threadIdx.x; -/** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ -void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); -} + if (tid >= n) return; -} + if (tid >= stride) + { + odata[tid] = idata[tid - stride] + idata[tid]; + } + else + { + odata[tid] = idata[tid]; + } + } + + __global__ void makeExclusive(int n, int * __restrict__ odata, int * __restrict__ idata) + { + unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid >= n) return; + + odata[tid] = tid == 0 ? 0 : idata[tid - 1]; + } + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ +#ifdef MEASURE_EXEC_TIME + float scan(int n, int *odata, const int *idata) + { + if (n <= 0 || !odata || !idata || odata == idata) + { + return -1; + } +#else + void scan(int n, int *odata, const int *idata) + { + if (n <= 0 || !odata || !idata || odata == idata) + { + return; + } +#endif + const size_t kArraySizeInByte = n * sizeof(int); + int *idata_dev = nullptr, *odata_dev = nullptr; + + cudaMalloc(&idata_dev, kArraySizeInByte); + cudaMalloc(&odata_dev, kArraySizeInByte); + cudaMemcpy(idata_dev, idata, kArraySizeInByte, cudaMemcpyHostToDevice); + +#ifdef MEASURE_EXEC_TIME + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); +#endif + + const int numLevels = ilog2ceil(n); + const int threadsPerBlock = 256; + const int numBlocks = (n + threadsPerBlock - 1) / threadsPerBlock; + int stride = 1; + + for (int i = 0; i < numLevels; ++i) + { + kernScanOneLevel << > >(stride, n, odata_dev, idata_dev); + stride <<= 1; + //swap + int *tmp = odata_dev; + odata_dev = idata_dev; + idata_dev = tmp; + } + makeExclusive << > >(n, odata_dev, idata_dev); + +#ifdef MEASURE_EXEC_TIME + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float millisceconds = 0; + cudaEventElapsedTime(&millisceconds, start, stop); + cudaEventDestroy(start); + cudaEventDestroy(stop); +#endif + + 
cudaMemcpy(odata, odata_dev, kArraySizeInByte, cudaMemcpyDeviceToHost); + cudaFree(idata_dev); + cudaFree(odata_dev); + cudaDeviceSynchronize(); // make sure result is ready + +#ifdef MEASURE_EXEC_TIME + return millisceconds; +#endif + } + + } } diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 21152d6..3ab5364 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -2,6 +2,10 @@ namespace StreamCompaction { namespace Naive { - void scan(int n, int *odata, const int *idata); +#ifdef MEASURE_EXEC_TIME + float scan(int n, int *odata, const int *idata); +#else + void scan(int n, int *odata, const int *idata); +#endif } } diff --git a/stream_compaction/radix_sort.cu b/stream_compaction/radix_sort.cu new file mode 100644 index 0000000..7cc94b1 --- /dev/null +++ b/stream_compaction/radix_sort.cu @@ -0,0 +1,201 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#define MEASURE_EXEC_TIME +#include +#include "radix_sort.h" + + +namespace std +{ + class InvalidArgument : public exception + { + public: + virtual const char *what() const throw() + { + return "One or more invalid arguments detected"; + } + }; +} + + +namespace ParallelRadixSort +{ + template + __global__ void kernClassify(uint32_t n, T mask, + uint32_t * __restrict__ notbools, uint32_t * __restrict__ bools, const T * __restrict__ idata) + { + uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid >= n) return; + + uint32_t t = static_cast((idata[tid] & mask) == 0); + notbools[tid] = t; + bools[tid] = t ^ 0x1; + } + + template + __global__ void kernScatter(uint32_t n, + uint32_t * __restrict__ nobools, uint32_t * __restrict__ noindices, uint32_t * __restrict__ yesindices, + T * __restrict__ odata, const T * __restrict__ idata) + { + uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid >= n) return; + + uint32_t isBitZero = nobools[tid]; + uint32_t idx; + + if (isBitZero) idx = noindices[tid]; else idx = 
yesindices[tid]; + + odata[idx] = idata[tid]; + } + +#ifdef MEASURE_EXEC_TIME + template + float sort(int n, T *odata, const T *idata, T bitMask, bool lsb) +#else + template + void sort(int n, T *odata, const T *idata, T bitMask, bool lsb) +#endif + { + if (n <= 0 || !odata || !idata) + { + throw std::InvalidArgument(); + } + + int segSize = StreamCompaction::Efficient4::computeSegmentSize(2 * n); + const size_t kDevArraySizeInByte = StreamCompaction::Efficient4::computeActualMemSize(2 * n); + + T *idata_dev = 0; + T *odata_dev = 0; + uint32_t *noyes_bools_dev = 0; + uint32_t *indices_dev = 0; + + cudaMalloc(&idata_dev, n * sizeof(T)); + cudaMalloc(&odata_dev, n * sizeof(T)); + cudaMalloc(&noyes_bools_dev, kDevArraySizeInByte); + cudaMalloc(&indices_dev, kDevArraySizeInByte); + + cudaMemcpy(idata_dev, idata, n * sizeof(T), cudaMemcpyHostToDevice); + + const int threadsPerBlock = 256; + int numBlocks = (n + threadsPerBlock - 1) / threadsPerBlock; + int numBits = 8 * sizeof(T); + T mask = lsb ? 
1 : (1 << (numBits - 1)); + +#ifdef MEASURE_EXEC_TIME + float execTime = 0.f; + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + + for (int i = 0; i < numBits; ++i) + { + if (!(bitMask & mask)) continue; // do not consider this bit + + kernClassify<<>>(n, mask, noyes_bools_dev, noyes_bools_dev + n, idata_dev); + + StreamCompaction::Efficient4::scanHelper(2 * n, indices_dev, noyes_bools_dev); + + kernScatter<<>>(n, noyes_bools_dev, indices_dev, indices_dev + n, odata_dev, idata_dev); + + if (lsb) mask <<= 1; else mask >>= 1; + + T *tmp = odata_dev; + odata_dev = idata_dev; + idata_dev = tmp; + } + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&execTime, start, stop); + + cudaEventDestroy(start); + cudaEventDestroy(stop); +#else + for (int i = 0; i < numBits; ++i) + { + if (!(bitMask & mask)) continue; // do not consider this bit + + kernClassify << > >(n, mask, noyes_bools_dev, noyes_bools_dev + n, idata_dev); + + StreamCompaction::Efficient::scanHelper(segSize, 2 * n, indices_dev, noyes_bools_dev); + + kernScatter << > >(n, noyes_bools_dev, indices_dev, indices_dev + n, odata_dev, idata_dev); + + if (lsb) mask <<= 1; else mask >>= 1; + + T *tmp = odata_dev; + odata_dev = idata_dev; + idata_dev = tmp; + } +#endif + + cudaMemcpy(odata, idata_dev, n * sizeof(T), cudaMemcpyDeviceToHost); + cudaFree(idata_dev); + cudaFree(odata_dev); + cudaFree(noyes_bools_dev); + cudaFree(indices_dev); + +#ifdef MEASURE_EXEC_TIME + return execTime; +#endif + } + + // Since template definition is not visible to users (main.obj in this case), + // we need to explicitly tell the compiler to generate all the template implementations + // that will be used later +#ifdef MEASURE_EXEC_TIME + template float sort(int n, uint32_t *odata, const uint32_t *idata, uint32_t bitMask, bool lsb); +#else + template void sort(int n, uint32_t *odata, const uint32_t *idata, uint32_t bitMask, bool lsb); +#endif + 
+#ifdef MEASURE_EXEC_TIME + template + float thrustSort(int n, T *odata, const T *idata) + { + if (n <= 0 || !odata || !idata) + { + throw std::InvalidArgument(); + } + + T *iodata_dev = 0; + + cudaMalloc(&iodata_dev, n * sizeof(T)); + cudaMemcpy(iodata_dev, idata, n * sizeof(T), cudaMemcpyHostToDevice); + + thrust::device_ptr thrust_iodata(iodata_dev); + + float execTime; + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); + + thrust::stable_sort(thrust_iodata, thrust_iodata + n); + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&execTime, start, stop); + cudaEventDestroy(start); + cudaEventDestroy(stop); + + cudaMemcpy(odata, iodata_dev, n * sizeof(T), cudaMemcpyDeviceToHost); + cudaFree(iodata_dev); + + return execTime; + } + + template float thrustSort(int n, uint32_t *odata, const uint32_t *idata); +#endif +} \ No newline at end of file diff --git a/stream_compaction/radix_sort.h b/stream_compaction/radix_sort.h new file mode 100644 index 0000000..8eea74d --- /dev/null +++ b/stream_compaction/radix_sort.h @@ -0,0 +1,19 @@ +#pragma once + +#include + + +namespace ParallelRadixSort +{ + // Only work on unsigned integral types +#ifdef MEASURE_EXEC_TIME + template + float sort(int n, T *odata, const T *idata, T bitMask, bool lsb = true); + + template + float thrustSort(int n, T *odata, const T *idata); +#else + template + void sort(int n, T *odata, const T *idata, T bitMask, bool lsb = true); +#endif +} \ No newline at end of file diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..4c9cbaa 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -4,19 +4,70 @@ #include #include #include "common.h" +#define MEASURE_EXEC_TIME #include "thrust.h" namespace StreamCompaction { -namespace Thrust { - -/** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
- */ -void scan(int n, int *odata, const int *idata) { - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); -} + namespace Thrust { -} + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ +#ifdef MEASURE_EXEC_TIME + float scan(int n, int *odata, const int *idata) + { + if (n <= 0 || !odata || !idata || odata == idata) + { + return -1; + } +#else + void scan(int n, int *odata, const int *idata) + { + if (n <= 0 || !odata || !idata || odata == idata) + { + return; + } +#endif + // TODO use `thrust::exclusive_scan` + // example: for device_vectors dv_in and dv_out: + // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + const size_t kArraySizeInByte = n * sizeof(int); + int *idata_dev = nullptr, *odata_dev = nullptr; + + cudaMalloc(&idata_dev, kArraySizeInByte); + cudaMalloc(&odata_dev, kArraySizeInByte); + cudaMemcpy(idata_dev, idata, kArraySizeInByte, cudaMemcpyHostToDevice); + + thrust::device_ptr thrust_idata(idata_dev); + thrust::device_ptr thrust_odata(odata_dev); + +#ifdef MEASURE_EXEC_TIME + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); +#endif + + thrust::exclusive_scan(thrust_idata, thrust_idata + n, thrust_odata); + +#ifdef MEASURE_EXEC_TIME + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float millisceconds = 0; + cudaEventElapsedTime(&millisceconds, start, stop); + cudaEventDestroy(start); + cudaEventDestroy(stop); +#endif + + cudaMemcpy(odata, odata_dev, kArraySizeInByte, cudaMemcpyDeviceToHost); + cudaFree(idata_dev); + cudaFree(odata_dev); + cudaDeviceSynchronize(); // make sure result is ready + +#ifdef MEASURE_EXEC_TIME + return millisceconds; +#endif + } + + } } diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h index 06707f3..baf6d97 100644 --- a/stream_compaction/thrust.h +++ b/stream_compaction/thrust.h 
@@ -2,6 +2,10 @@ namespace StreamCompaction { namespace Thrust { - void scan(int n, int *odata, const int *idata); +#ifdef MEASURE_EXEC_TIME + float scan(int n, int *odata, const int *idata); +#else + void scan(int n, int *odata, const int *idata); +#endif } }