diff --git a/CMakeLists.txt b/CMakeLists.txt
index dff84f8..275f3a9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,10 +97,3 @@ target_link_libraries(${CMAKE_PROJECT_NAME}
     ${CORELIBS}
     )
 
-add_custom_command(
-    TARGET ${CMAKE_PROJECT_NAME}
-    POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy_directory
-        ${CMAKE_SOURCE_DIR}/shaders
-        ${CMAKE_BINARY_DIR}/shaders
-    )
diff --git a/README.md b/README.md
index 41b91f0..0a39774 100644
--- a/README.md
+++ b/README.md
@@ -1,19 +1,93 @@
 CUDA Rasterizer
 ===============
 
-[CLICK ME FOR INSTRUCTION OF THIS PROJECT](./INSTRUCTION.md)
-
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4**
 
-* (TODO) YOUR NAME HERE
-	* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Yu Sun 
+* [LinkedIn](https://www.linkedin.com/in/yusun3/)
+* Tested on: Tested on: Windows 10 , i7-6700HQ CPU @ 2.60GHz × 8 , GeForce GTX 960M/PCIe/SSE2, 7.7GB Memory (Personal Laptop)
+
+## Introduction
+In this project, a basic rasterized graphic pipeline is implemented using cuda. The pipeline includes
+vertex shading, primitive assembly, rasterization, fragment shading, and a framebuffer. The special features I implement 
+for this project includes 
+
+```
+* Rasterization for lines and points
+* Back-face culling
+* UV texture mapping with bilinear texture filtering and perspective correct texture coordinates.
+* Super-sample Anti-Aliasing 
+```
+
+Forgive my briefness for the README, I'll add more things if I get more time later.
+
+***Rasterization for Points, Lines and Triangles*** 
+
+Rasterization for points is straightforward by computing the corresponding pixel index and color. 
+
+Rasterization for lines is approximated using the Bresenham Algorithm since a naive approach would lead to artifact 
+coming from fixed grid resolution, and rounding float pixel locations to integer pixel locations
+
+Rasterization for traingles is achieved by using barycentric coordinates 
+
+Milk Car       | Duck | Flower
+:-------------------------:|:-------------------------:|:-------------------------:
+![](img/milkcar.gif)  |  ![](img/duck.gif)  |  ![](img/flower.gif)
+
+***Texture Mapping and Prospective Correction with Bilinear Interpolation***
+
+The texture mapping can be achieved by using uv coordinates that warp 2D textures onto 3D mesh. Techniques used to make 
+the texture look better include bilinear interpolation and perspective correction
+using the depth information. 
+
+A comparision of texture mapping with and without perspective correction is shown below.
+
+Scene without Perspective Correction      | Scene with Perspective Correction
+:-------------------------:|:-------------------------:
+<img src="img/check.PNG" width="300"/> |  <img src="img/check_corrected.PNG" width="300"/> 
+
+Bilinear interpolation is basically a techique used to prevent aliasing effect and make the resulting image
+looks more natural and smooth by taking the color of surrounding pixels into account while generating the final
+color for a specific pixel. Since the effect isn't that obvious, I picked one image that I found through Google Image
+to demonstrate the effect.
+
+![](img/interpolation.png) 
+
+***Back-face Culling***
+
+Back-face culling is intended to reduce the amount of computation by eliminating the pixels that cannot be captured from 
+the camera. However, while implementing it I found that I didn't see a significant speed up, and it actually creates some 
+funny effect.
+
+Scene without Back-face Culling     | Scene with Back-face Culling
+:-------------------------:|:-------------------------:
+![](img/no_culling.PNG)  |  ![](img/culling.PNG) 
+
+The things that are culled out can be seen below
+
+![](img/culled.gif) 
+
+***Super-Sampled Antialiasing***
+
+By super-sampling, one is essentially making more grids and creating higher resolution. This is a sacrifice on memory to 
+give more details to the display. The difference can be seen below.
+
+SSAA Factor = 1     | SSAA Factor= 4
+:-------------------------:|:-------------------------:
+![](img/ssaa=1.gif)  |  ![](img/ssaa=4.gif) 
+
+
+## Performance Analysis 
+
+Below is a graph demonstrating the amount of time spent on each stage of the graphics pipeline for different type of scenes.
+Without much surprise, it can be seen that most time of the computation is spent on primitive rasterization. 
 
-### (TODO: Your README)
+![](img/compute.png) 
 
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+Notice how computation increase while we move closer to the rendered object due to the increase amount of checks we need to perform.
 
+![](img/z.png) 
+  
 
 ### Credits
 
diff --git a/img/check.PNG b/img/check.PNG
new file mode 100755
index 0000000..3d59dac
Binary files /dev/null and b/img/check.PNG differ
diff --git a/img/check_corrected.PNG b/img/check_corrected.PNG
new file mode 100755
index 0000000..c7bbeab
Binary files /dev/null and b/img/check_corrected.PNG differ
diff --git a/img/compute.png b/img/compute.png
new file mode 100644
index 0000000..38e9f3a
Binary files /dev/null and b/img/compute.png differ
diff --git a/img/culled.gif b/img/culled.gif
new file mode 100755
index 0000000..8b1d258
Binary files /dev/null and b/img/culled.gif differ
diff --git a/img/culling.PNG b/img/culling.PNG
new file mode 100755
index 0000000..96e27a8
Binary files /dev/null and b/img/culling.PNG differ
diff --git a/img/duck.gif b/img/duck.gif
new file mode 100755
index 0000000..49422fc
Binary files /dev/null and b/img/duck.gif differ
diff --git a/img/flower.gif b/img/flower.gif
new file mode 100755
index 0000000..bfb70d6
Binary files /dev/null and b/img/flower.gif differ
diff --git a/img/interpolation.png b/img/interpolation.png
new file mode 100644
index 0000000..d5ef0eb
Binary files /dev/null and b/img/interpolation.png differ
diff --git a/img/milkcar.gif b/img/milkcar.gif
new file mode 100755
index 0000000..c6570ac
Binary files /dev/null and b/img/milkcar.gif differ
diff --git a/img/no_culling.PNG b/img/no_culling.PNG
new file mode 100755
index 0000000..0f8957d
Binary files /dev/null and b/img/no_culling.PNG differ
diff --git a/img/ssaa=1.gif b/img/ssaa=1.gif
new file mode 100755
index 0000000..67154e6
Binary files /dev/null and b/img/ssaa=1.gif differ
diff --git a/img/ssaa=4.gif b/img/ssaa=4.gif
new file mode 100755
index 0000000..6dd78d3
Binary files /dev/null and b/img/ssaa=4.gif differ
diff --git a/img/z.png b/img/z.png
new file mode 100644
index 0000000..733c109
Binary files /dev/null and b/img/z.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 7986959..679f2cc 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,10 +1,10 @@
 /**
- * @file      main.cpp
- * @brief     Main file for CUDA rasterizer. Handles CUDA-GL interop for display.
- * @authors   Skeleton code: Yining Karl Li, Kai Ninomiya, Shuai Shao (Shrek)
- * @date      2012-2016
- * @copyright University of Pennsylvania
- */
+* @file      main.cpp
+* @brief     Main file for CUDA rasterizer. Handles CUDA-GL interop for display.
+* @authors   Skeleton code: Yining Karl Li, Kai Ninomiya, Shuai Shao (Shrek)
+* @date      2012-2016
+* @copyright University of Pennsylvania
+*/
 
 
 
@@ -12,42 +12,52 @@
 
 #define STB_IMAGE_IMPLEMENTATION
 #define TINYGLTF_LOADER_IMPLEMENTATION
+
 #include <util/tiny_gltf_loader.h>
 
+//-------------------------------
+//---------RUNTIME STUFF---------
+//-------------------------------
+float scale = 1.0f;
+float x_trans = 0.0f, y_trans = 0.0f, z_trans = -10.0f;
+float x_angle = 0.0f, y_angle = 0.0f;
+int primitive_type = 0;
+
 //-------------------------------
 //-------------MAIN--------------
 //-------------------------------
 
+
 int main(int argc, char **argv) {
     if (argc != 2) {
         cout << "Usage: [gltf file]. Press Enter to exit" << endl;
-		getchar();
+        getchar();
         return 0;
     }
 
-	tinygltf::Scene scene;
-	tinygltf::TinyGLTFLoader loader;
-	std::string err;
-	std::string input_filename(argv[1]);
-	std::string ext = getFilePathExtension(input_filename);
-
-	bool ret = false;
-	if (ext.compare("glb") == 0) {
-		// assume binary glTF.
-		ret = loader.LoadBinaryFromFile(&scene, &err, input_filename.c_str());
-	} else {
-		// assume ascii glTF.
-		ret = loader.LoadASCIIFromFile(&scene, &err, input_filename.c_str());
-	}
+    tinygltf::Scene scene;
+    tinygltf::TinyGLTFLoader loader;
+    std::string err;
+    std::string input_filename(argv[1]);
+    std::string ext = getFilePathExtension(input_filename);
+
+    bool ret = false;
+    if (ext.compare("glb") == 0) {
+        // assume binary glTF.
+        ret = loader.LoadBinaryFromFile(&scene, &err, input_filename.c_str());
+    } else {
+        // assume ascii glTF.
+        ret = loader.LoadASCIIFromFile(&scene, &err, input_filename.c_str());
+    }
 
-	if (!err.empty()) {
-		printf("Err: %s\n", err.c_str());
-	}
+    if (!err.empty()) {
+        printf("Err: %s\n", err.c_str());
+    }
 
-	if (!ret) {
-		printf("Failed to parse glTF\n");
-		return -1;
-	}
+    if (!ret) {
+        printf("Failed to parse glTF\n");
+        return -1;
+    }
 
 
     frame = 0;
@@ -68,7 +78,7 @@ void mainLoop() {
         glfwPollEvents();
         runCuda();
 
-        time_t seconds2 = time (NULL);
+        time_t seconds2 = time(NULL);
 
         if (seconds2 - seconds >= 1) {
 
@@ -77,7 +87,8 @@ void mainLoop() {
             seconds = seconds2;
         }
 
-        string title = "CIS565 Rasterizer | " + utilityCore::convertIntToString((int)fps) + " FPS";
+        string title = "CIS565 Rasterizer | " + utilityCore::convertIntToString((int) fps) + " FPS" +
+                       "; Z = " + utilityCore::convertIntToString((int) z_trans);
         glfwSetWindowTitle(window, title.c_str());
 
         glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
@@ -86,52 +97,48 @@ void mainLoop() {
         glClear(GL_COLOR_BUFFER_BIT);
 
         // VAO, shader program, and texture already bound
-        glDrawElements(GL_TRIANGLES, 6,  GL_UNSIGNED_SHORT, 0);
+        glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_SHORT, 0);
         glfwSwapBuffers(window);
     }
     glfwDestroyWindow(window);
     glfwTerminate();
 }
 
-//-------------------------------
-//---------RUNTIME STUFF---------
-//-------------------------------
-float scale = 1.0f;
-float x_trans = 0.0f, y_trans = 0.0f, z_trans = -10.0f;
-float x_angle = 0.0f, y_angle = 0.0f;
 void runCuda() {
     // Map OpenGL buffer object for writing from CUDA on a single GPU
     // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not use this buffer
     dptr = NULL;
 
-	glm::mat4 P = glm::frustum<float>(-scale * ((float)width) / ((float)height),
-		scale * ((float)width / (float)height),
-		-scale, scale, 1.0, 1000.0);
+    glm::mat4 P = glm::frustum<float>(-scale * ((float) width) / ((float) height),
+                                      scale * ((float) width / (float) height),
+                                      -scale, scale, 1.0, 1000.0);
 
-	glm::mat4 V = glm::mat4(1.0f);
+    glm::mat4 V = glm::mat4(1.0f);
 
-	glm::mat4 M =
-		glm::translate(glm::vec3(x_trans, y_trans, z_trans))
-		* glm::rotate(x_angle, glm::vec3(1.0f, 0.0f, 0.0f))
-		* glm::rotate(y_angle, glm::vec3(0.0f, 1.0f, 0.0f));
+    glm::mat4 M =
+            glm::translate(glm::vec3(x_trans, y_trans, z_trans))
+            * glm::rotate(x_angle, glm::vec3(1.0f, 0.0f, 0.0f))
+            * glm::rotate(y_angle, glm::vec3(0.0f, 1.0f, 0.0f));
 
-	glm::mat3 MV_normal = glm::transpose(glm::inverse(glm::mat3(V) * glm::mat3(M)));
-	glm::mat4 MV = V * M;
-	glm::mat4 MVP = P * MV;
+    glm::mat3 MV_normal = glm::transpose(glm::inverse(glm::mat3(V) * glm::mat3(M)));
+    glm::mat4 MV = V * M;
+    glm::mat4 MVP = P * MV;
 
-    cudaGLMapBufferObject((void **)&dptr, pbo);
-	rasterize(dptr, MVP, MV, MV_normal);
+    cudaGLMapBufferObject((void **) &dptr, pbo);
+    rasterize(dptr, MVP, MV, MV_normal, primitive_type);
     cudaGLUnmapBufferObject(pbo);
 
     frame++;
     fpstracker++;
+    y_angle += 0.04f;
+
 }
 
 //-------------------------------
 //----------SETUP STUFF----------
 //-------------------------------
 
-bool init(const tinygltf::Scene & scene) {
+bool init(const tinygltf::Scene &scene) {
     glfwSetErrorCallback(errorCallback);
 
     if (!glfwInit()) {
@@ -158,30 +165,32 @@ bool init(const tinygltf::Scene & scene) {
     initVAO();
     initTextures();
     initCuda();
-	initPBO();
-
-	// Mouse Control Callbacks
-	glfwSetMouseButtonCallback(window, mouseButtonCallback);
-	glfwSetCursorPosCallback(window, mouseMotionCallback);
-	glfwSetScrollCallback(window, mouseWheelCallback);
-
-	{
-		std::map<std::string, std::vector<std::string> >::const_iterator it(
-			scene.scenes.begin());
-		std::map<std::string, std::vector<std::string> >::const_iterator itEnd(
-			scene.scenes.end());
-
-		for (; it != itEnd; it++) {
-			for (size_t i = 0; i < it->second.size(); i++) {
-				std::cout << it->second[i]
-					<< ((i != (it->second.size() - 1)) ? ", " : "");
-			}
-			std::cout << " ] " << std::endl;
-		}
-	}
+    initPBO();
+
+    // Mouse Control Callbacks
+    glfwSetMouseButtonCallback(window, mouseButtonCallback);
+    glfwSetCursorPosCallback(window, mouseMotionCallback);
+    glfwSetScrollCallback(window, mouseWheelCallback);
+
+    {
+        std::map < std::string, std::vector < std::string > > ::const_iterator
+        it(
+                scene.scenes.begin());
+        std::map < std::string, std::vector < std::string > > ::const_iterator
+        itEnd(
+                scene.scenes.end());
+
+        for (; it != itEnd; it++) {
+            for (size_t i = 0; i < it->second.size(); i++) {
+                std::cout << it->second[i]
+                          << ((i != (it->second.size() - 1)) ? ", " : "");
+            }
+            std::cout << " ] " << std::endl;
+        }
+    }
 
 
-	rasterizeSetBuffers(scene);
+    rasterizeSetBuffers(scene);
 
     GLuint passthroughProgram;
     passthroughProgram = initShader();
@@ -225,38 +234,38 @@ void initTextures() {
     glBindTexture(GL_TEXTURE_2D, displayImage);
     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-    glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_BGRA,
-                  GL_UNSIGNED_BYTE, NULL);
+    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_BGRA,
+                 GL_UNSIGNED_BYTE, NULL);
 }
 
 void initVAO(void) {
     GLfloat vertices[] = {
-        -1.0f, -1.0f,
-        1.0f, -1.0f,
-        1.0f,  1.0f,
-        -1.0f,  1.0f,
+            -1.0f, -1.0f,
+            1.0f, -1.0f,
+            1.0f, 1.0f,
+            -1.0f, 1.0f,
     };
 
     GLfloat texcoords[] = {
-        1.0f, 1.0f,
-        0.0f, 1.0f,
-        0.0f, 0.0f,
-        1.0f, 0.0f
+            1.0f, 1.0f,
+            0.0f, 1.0f,
+            0.0f, 0.0f,
+            1.0f, 0.0f
     };
 
-    GLushort indices[] = { 0, 1, 3, 3, 1, 2 };
+    GLushort indices[] = {0, 1, 3, 3, 1, 2};
 
     GLuint vertexBufferObjID[3];
     glGenBuffers(3, vertexBufferObjID);
 
     glBindBuffer(GL_ARRAY_BUFFER, vertexBufferObjID[0]);
     glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STATIC_DRAW);
-    glVertexAttribPointer((GLuint)positionLocation, 2, GL_FLOAT, GL_FALSE, 0, 0);
+    glVertexAttribPointer((GLuint) positionLocation, 2, GL_FLOAT, GL_FALSE, 0, 0);
     glEnableVertexAttribArray(positionLocation);
 
     glBindBuffer(GL_ARRAY_BUFFER, vertexBufferObjID[1]);
     glBufferData(GL_ARRAY_BUFFER, sizeof(texcoords), texcoords, GL_STATIC_DRAW);
-    glVertexAttribPointer((GLuint)texcoordsLocation, 2, GL_FLOAT, GL_FALSE, 0, 0);
+    glVertexAttribPointer((GLuint) texcoordsLocation, 2, GL_FLOAT, GL_FALSE, 0, 0);
     glEnableVertexAttribArray(texcoordsLocation);
 
     glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, vertexBufferObjID[2]);
@@ -265,7 +274,7 @@ void initVAO(void) {
 
 
 GLuint initShader() {
-    const char *attribLocations[] = { "Position", "Tex" };
+    const char *attribLocations[] = {"Position", "Tex"};
     GLuint program = glslUtility::createDefaultProgram(attribLocations, 2);
     GLint location;
 
@@ -298,13 +307,13 @@ void deletePBO(GLuint *pbo) {
         glBindBuffer(GL_ARRAY_BUFFER, *pbo);
         glDeleteBuffers(1, pbo);
 
-        *pbo = (GLuint)NULL;
+        *pbo = (GLuint) NULL;
     }
 }
 
 void deleteTexture(GLuint *tex) {
     glDeleteTextures(1, tex);
-    *tex = (GLuint)NULL;
+    *tex = (GLuint) NULL;
 }
 
 void shut_down(int return_code) {
@@ -327,6 +336,9 @@ void errorCallback(int error, const char *description) {
 void keyCallback(GLFWwindow *window, int key, int scancode, int action, int mods) {
     if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS) {
         glfwSetWindowShouldClose(window, GL_TRUE);
+    } else if (key == GLFW_KEY_N && action == GLFW_RELEASE) {
+        primitive_type++;
+        if (primitive_type > 2) primitive_type = 0;
     }
 }
 
@@ -334,9 +346,9 @@ void keyCallback(GLFWwindow *window, int key, int scancode, int action, int mods
 //----- util -----------------
 //----------------------------
 static std::string getFilePathExtension(const std::string &FileName) {
-	if (FileName.find_last_of(".") != std::string::npos)
-		return FileName.substr(FileName.find_last_of(".") + 1);
-	return "";
+    if (FileName.find_last_of(".") != std::string::npos)
+        return FileName.substr(FileName.find_last_of(".") + 1);
+    return "";
 }
 
 
@@ -345,56 +357,48 @@ static std::string getFilePathExtension(const std::string &FileName) {
 //---- Mouse control ----------
 //-----------------------------
 
-enum ControlState { NONE = 0, ROTATE, TRANSLATE };
+enum ControlState {
+    NONE = 0, ROTATE, TRANSLATE
+};
 ControlState mouseState = NONE;
-void mouseButtonCallback(GLFWwindow* window, int button, int action, int mods)
-{
-	if (action == GLFW_PRESS)
-	{
-		if (button == GLFW_MOUSE_BUTTON_LEFT)
-		{
-			mouseState = ROTATE;
-		}
-		else if (button == GLFW_MOUSE_BUTTON_RIGHT)
-		{
-			mouseState = TRANSLATE;
-		}
-
-	}
-	else if (action == GLFW_RELEASE)
-	{
-		mouseState = NONE;
-	}
+
+void mouseButtonCallback(GLFWwindow *window, int button, int action, int mods) {
+    if (action == GLFW_PRESS) {
+        if (button == GLFW_MOUSE_BUTTON_LEFT) {
+            mouseState = ROTATE;
+        } else if (button == GLFW_MOUSE_BUTTON_RIGHT) {
+            mouseState = TRANSLATE;
+        }
+
+    } else if (action == GLFW_RELEASE) {
+        mouseState = NONE;
+    }
 }
 
-double lastx = (double)width / 2;
-double lasty = (double)height / 2;
-void mouseMotionCallback(GLFWwindow* window, double xpos, double ypos)
-{
-	const double s_r = 0.01;
-	const double s_t = 0.01;
-
-	double diffx = xpos - lastx;
-	double diffy = ypos - lasty;
-	lastx = xpos;
-	lasty = ypos;
-
-	if (mouseState == ROTATE)
-	{
-		//rotate
-		x_angle += (float)s_r * diffy;
-		y_angle += (float)s_r * diffx;
-	}
-	else if (mouseState == TRANSLATE)
-	{
-		//translate
-		x_trans += (float)(s_t * diffx);
-		y_trans += (float)(-s_t * diffy);
-	}
+double lastx = (double) width / 2;
+double lasty = (double) height / 2;
+
+void mouseMotionCallback(GLFWwindow *window, double xpos, double ypos) {
+    const double s_r = 0.01;
+    const double s_t = 0.01;
+
+    double diffx = xpos - lastx;
+    double diffy = ypos - lasty;
+    lastx = xpos;
+    lasty = ypos;
+
+    if (mouseState == ROTATE) {
+        //rotate
+        x_angle += (float) s_r * diffy;
+        y_angle += (float) s_r * diffx;
+    } else if (mouseState == TRANSLATE) {
+        //translate
+        x_trans += (float) (s_t * diffx);
+        y_trans += (float) (-s_t * diffy);
+    }
 }
 
-void mouseWheelCallback(GLFWwindow* window, double xoffset, double yoffset)
-{
-	const double s = 1.0;	// sensitivity
-	z_trans += (float)(s * yoffset);
+void mouseWheelCallback(GLFWwindow *window, double xoffset, double yoffset) {
+    const double s = 1.0;    // sensitivity
+    z_trans += (float) (s * yoffset);
 }
diff --git a/src/rasterize.cu b/src/rasterize.cu
index 1262a09..4b3cdb8 100644
--- a/src/rasterize.cu
+++ b/src/rasterize.cu
@@ -1,10 +1,10 @@
 /**
- * @file      rasterize.cu
- * @brief     CUDA-accelerated rasterization pipeline.
- * @authors   Skeleton code: Yining Karl Li, Kai Ninomiya, Shuai Shao (Shrek)
- * @date      2012-2016
- * @copyright University of Pennsylvania & STUDENT
- */
+* @file      rasterize.cu
+* @brief     CUDA-accelerated rasterization pipeline.
+* @authors   Skeleton code: Yining Karl Li, Kai Ninomiya, Shuai Shao (Shrek)
+* @date      2012-2016
+* @copyright University of Pennsylvania & STUDENT
+*/
 
 #include <cmath>
 #include <cstdio>
@@ -17,104 +17,137 @@
 #include "rasterize.h"
 #include <glm/gtc/quaternion.hpp>
 #include <glm/gtc/matrix_transform.hpp>
+#include <thrust/remove.h>
+#include <thrust/execution_policy.h>
+
+#define SSAA_RES 1
+#define CORRECT_PROSPECTIVE 1
+#define BACKFACE_CULLING 0
+
+//#define LINE 0
+//#define POINT 0
+//#define TRIANGLE 1
+
+#ifndef imax
+#define imax(a, b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef imin
+#define imin(a, b) (((a) < (b)) ? (a) : (b))
+#endif
+#define SCREENGAMMA 2.2
+
+template<typename T>
+__host__ __device__
+
+void swap(T &a, T &b) {
+    T tmp(a);
+    a = b;
+    b = tmp;
+}
 
 namespace {
 
-	typedef unsigned short VertexIndex;
-	typedef glm::vec3 VertexAttributePosition;
-	typedef glm::vec3 VertexAttributeNormal;
-	typedef glm::vec2 VertexAttributeTexcoord;
-	typedef unsigned char TextureData;
-
-	typedef unsigned char BufferByte;
-
-	enum PrimitiveType{
-		Point = 1,
-		Line = 2,
-		Triangle = 3
-	};
-
-	struct VertexOut {
-		glm::vec4 pos;
-
-		// TODO: add new attributes to your VertexOut
-		// The attributes listed below might be useful, 
-		// but always feel free to modify on your own
-
-		 glm::vec3 eyePos;	// eye space position used for shading
-		 glm::vec3 eyeNor;	// eye space normal used for shading, cuz normal will go wrong after perspective transformation
-		// glm::vec3 col;
-		 glm::vec2 texcoord0;
-		 TextureData* dev_diffuseTex = NULL;
-		// int texWidth, texHeight;
-		// ...
-	};
-
-	struct Primitive {
-		PrimitiveType primitiveType = Triangle;	// C++ 11 init
-		VertexOut v[3];
-	};
-
-	struct Fragment {
-		glm::vec3 color;
-
-		// TODO: add new attributes to your Fragment
-		// The attributes listed below might be useful, 
-		// but always feel free to modify on your own
-
-		// glm::vec3 eyePos;	// eye space position used for shading
-		// glm::vec3 eyeNor;
-		// VertexAttributeTexcoord texcoord0;
-		// TextureData* dev_diffuseTex;
-		// ...
-	};
-
-	struct PrimitiveDevBufPointers {
-		int primitiveMode;	//from tinygltfloader macro
-		PrimitiveType primitiveType;
-		int numPrimitives;
-		int numIndices;
-		int numVertices;
-
-		// Vertex In, const after loaded
-		VertexIndex* dev_indices;
-		VertexAttributePosition* dev_position;
-		VertexAttributeNormal* dev_normal;
-		VertexAttributeTexcoord* dev_texcoord0;
-
-		// Materials, add more attributes when needed
-		TextureData* dev_diffuseTex;
-		int diffuseTexWidth;
-		int diffuseTexHeight;
-		// TextureData* dev_specularTex;
-		// TextureData* dev_normalTex;
-		// ...
-
-		// Vertex Out, vertex used for rasterization, this is changing every frame
-		VertexOut* dev_verticesOut;
-
-		// TODO: add more attributes when needed
-	};
+    typedef unsigned short VertexIndex;
+    typedef glm::vec3 VertexAttributePosition;
+    typedef glm::vec3 VertexAttributeNormal;
+    typedef glm::vec2 VertexAttributeTexcoord;
+    typedef unsigned char TextureData;
+
+    typedef unsigned char BufferByte;
+
+    enum PrimitiveType {
+        Point = 1,
+        Line = 2,
+        Triangle = 3
+    };
+
+    struct VertexOut {
+        glm::vec4 pos;
+
+        // TODO: add new attributes to your VertexOut
+        // The attributes listed below might be useful,
+        // but always feel free to modify on your own
+
+        glm::vec3 eyePos;    // eye space position used for shading
+        glm::vec3 eyeNor;    // eye space normal used for shading, cuz normal will go wrong after perspective transformation
+        glm::vec3 col;
+        glm::vec2 texcoord0;
+        TextureData *dev_diffuseTex = NULL;
+        int texWidth, texHeight;
+        // ...
+    };
+
+    struct Primitive {
+        PrimitiveType primitiveType = Triangle;    // C++ 11 init
+        VertexOut v[3];
+        bool cull = false;
+    };
+
+    struct Fragment {
+        glm::vec3 color;
+
+        // TODO: add new attributes to your Fragment
+        // The attributes listed below might be useful,
+        // but always feel free to modify on your own
+
+        glm::vec3 eyePos;    // eye space position used for shading
+        glm::vec3 eyeNor;
+        VertexAttributeTexcoord texcoord0;
+        TextureData *dev_diffuseTex;
+        int texWidth, texHeight;
+        // ...
+    };
+
+    struct PrimitiveDevBufPointers {
+        int primitiveMode;    //from tinygltfloader macro
+        PrimitiveType primitiveType;
+        int numPrimitives;
+        int numIndices;
+        int numVertices;
+
+        // Vertex In, const after loaded
+        VertexIndex *dev_indices;
+        VertexAttributePosition *dev_position;
+        VertexAttributeNormal *dev_normal;
+        VertexAttributeTexcoord *dev_texcoord0;
+
+        // Materials, add more attributes when needed
+        TextureData *dev_diffuseTex;
+        int diffuseTexWidth;
+        int diffuseTexHeight;
+        // TextureData* dev_specularTex;
+        // TextureData* dev_normalTex;
+        // ...
+
+        // Vertex Out, vertex used for rasterization, this is changing every frame
+        VertexOut *dev_verticesOut;
+
+        // TODO: add more attributes when needed
+    };
 
 }
 
-static std::map<std::string, std::vector<PrimitiveDevBufPointers>> mesh2PrimitivesMap;
+static std::map <std::string, std::vector<PrimitiveDevBufPointers>> mesh2PrimitivesMap;
 
 
 static int width = 0;
 static int height = 0;
-
+static int screen_width = 0;
+static int screen_height = 0;
 static int totalNumPrimitives = 0;
+
 static Primitive *dev_primitives = NULL;
 static Fragment *dev_fragmentBuffer = NULL;
 static glm::vec3 *dev_framebuffer = NULL;
 
-static int * dev_depth = NULL;	// you might need this buffer when doing depth test
+static int *dev_depth = NULL;    // you might need this buffer when doing depth test
 
+static cudaEvent_t start, stop;
 /**
- * Kernel that writes the image to the OpenGL PBO directly.
- */
-__global__ 
+* Kernel that writes the image to the OpenGL PBO directly.
+*/
+__global__
 void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) {
     int x = (blockIdx.x * blockDim.x) + threadIdx.x;
     int y = (blockIdx.y * blockDim.y) + threadIdx.y;
@@ -122,9 +155,16 @@ void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) {
 
     if (x < w && y < h) {
         glm::vec3 color;
-        color.x = glm::clamp(image[index].x, 0.0f, 1.0f) * 255.0;
-        color.y = glm::clamp(image[index].y, 0.0f, 1.0f) * 255.0;
-        color.z = glm::clamp(image[index].z, 0.0f, 1.0f) * 255.0;
+        for (int i = 0; i < SSAA_RES; i++) {
+            for (int j = 0; j < SSAA_RES; j++) {
+                int ss_index = x * SSAA_RES + i + (y * SSAA_RES + j) * w * SSAA_RES;
+                color.x += glm::clamp(image[ss_index].x, 0.0f, 1.0f) * 255.f;
+                color.y += glm::clamp(image[ss_index].y, 0.0f, 1.0f) * 255.f;
+                color.z += glm::clamp(image[ss_index].z, 0.0f, 1.0f) * 255.f;
+            }
+        }
+        color /= (float)(SSAA_RES * SSAA_RES);
+
         // Each thread writes one pixel location in the texture (textel)
         pbo[index].w = 0;
         pbo[index].x = color.x;
@@ -133,53 +173,113 @@ void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) {
     }
 }
 
-/** 
+__host__ __device__
+
+glm::vec3 getRGBColor(const int idx, const TextureData *texture) {
+    return glm::vec3(texture[idx] / 255.0f, texture[idx + 1] / 255.0f, texture[idx + 2] / 255.0f);
+}
+
+
+__host__ __device__
+
+glm::vec3 textureMapping(const int w, const int h, const glm::vec2 uv, const TextureData *texture) {
+    // bilinear interpolation wikipedia
+    float _x = w * uv.x;
+    float _y = h * uv.y;
+    int x = (int)_x;
+    int y = (int)_y;
+
+    glm::vec3 q00 = getRGBColor(3 * (x + y * w), texture);
+    glm::vec3 q10 = getRGBColor(3 * (x + 1 + y * w), texture);
+    glm::vec3 q01 = getRGBColor(3 * (x + (y + 1) * w), texture);
+    glm::vec3 q11 = getRGBColor(3 * (x + 1 + (y + 1) * w), texture);
+
+    float dx = _x - x;
+    float dy = _y - y;
+
+    return (q00 * (1.f - dx) * (1.f - dy) + q10 * (1.f - dy) * dx + q01 * (1.f - dx) * dy + q11 * dx * dy);
+}
+
+
+/**
 * Writes fragment colors to the framebuffer
 */
 __global__
-void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) {
+void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer, int primitive_type) {
     int x = (blockIdx.x * blockDim.x) + threadIdx.x;
     int y = (blockIdx.y * blockDim.y) + threadIdx.y;
     int index = x + (y * w);
 
     if (x < w && y < h) {
-        framebuffer[index] = fragmentBuffer[index].color;
-
-		// TODO: add your fragment shader code here
 
+        // TODO: add your fragment shader code here
+        // texture mapping
+        if (primitive_type == 0) {
+            if (fragmentBuffer[index].dev_diffuseTex != NULL) {
+                glm::vec3 diffuseTexture = textureMapping(fragmentBuffer[index].texWidth, fragmentBuffer[index].texHeight,
+                                                          fragmentBuffer[index].texcoord0,
+                                                          fragmentBuffer[index].dev_diffuseTex);
+                // wikipedia blin phong shading model
+                glm::vec3 lightPos = glm::vec3(50.f);
+                glm::vec3 lightColor = glm::vec3(1.0f);
+                glm::vec3 ambientColor = glm::vec3(0.9f);
+                glm::vec3 specular = glm::vec3(0.9f);
+                float lightPower = 1.2;
+
+                glm::vec3 lightDir = glm::normalize(lightPos - fragmentBuffer[index].eyePos);
+                glm::vec3 eyeDir = glm::normalize(-fragmentBuffer[index].eyePos);
+                float lambertian = imax(glm::dot(fragmentBuffer[index].eyeNor, lightDir), 0);
+
+                specular *= pow(imax(glm::dot(glm::normalize(lightDir + eyeDir), fragmentBuffer[index].eyeNor), 0), 16.0f);
+
+                glm::vec3 color =
+                        ambientColor * 0.1f * lightColor + (diffuseTexture * lambertian + specular) * lightColor * lightPower;
+
+                color = pow(color, glm::vec3(1.f / SCREENGAMMA));
+
+                framebuffer[index] = color;
+            }
+            else {
+                framebuffer[index] = fragmentBuffer[index].color;
+            }
+
+        }
+        else {
+            framebuffer[index] = fragmentBuffer[index].color;
+        }
     }
 }
 
 /**
- * Called once at the beginning of the program to allocate memory.
- */
+* Called once at the beginning of the program to allocate memory.
+*/
 void rasterizeInit(int w, int h) {
-    width = w;
-    height = h;
-	cudaFree(dev_fragmentBuffer);
-	cudaMalloc(&dev_fragmentBuffer, width * height * sizeof(Fragment));
-	cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
+    width = w * SSAA_RES;
+    height = h * SSAA_RES;
+    screen_width = w;
+    screen_height = h;
+    cudaFree(dev_fragmentBuffer);
+    cudaMalloc(&dev_fragmentBuffer, width * height * sizeof(Fragment));
+    cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
     cudaFree(dev_framebuffer);
-    cudaMalloc(&dev_framebuffer,   width * height * sizeof(glm::vec3));
+    cudaMalloc(&dev_framebuffer, width * height * sizeof(glm::vec3));
     cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3));
-    
-	cudaFree(dev_depth);
-	cudaMalloc(&dev_depth, width * height * sizeof(int));
 
-	checkCUDAError("rasterizeInit");
+    cudaFree(dev_depth);
+    cudaMalloc(&dev_depth, width * height * sizeof(int));
+
+    checkCUDAError("rasterizeInit");
 }
 
 __global__
-void initDepth(int w, int h, int * depth)
-{
-	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-
-	if (x < w && y < h)
-	{
-		int index = x + (y * w);
-		depth[index] = INT_MAX;
-	}
+void initDepth(int w, int h, int *depth) {
+    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+    if (x < w && y < h) {
+        int index = x + (y * w);
+        depth[index] = INT_MAX;
+    }
 }
 
 
@@ -187,590 +287,867 @@ void initDepth(int w, int h, int * depth)
 * kern function with support for stride to sometimes replace cudaMemcpy
 * One thread is responsible for copying one component
 */
-__global__ 
-void _deviceBufferCopy(int N, BufferByte* dev_dst, const BufferByte* dev_src, int n, int byteStride, int byteOffset, int componentTypeByteSize) {
-	
-	// Attribute (vec3 position)
-	// component (3 * float)
-	// byte (4 * byte)
-
-	// id of component
-	int i = (blockIdx.x * blockDim.x) + threadIdx.x;
-
-	if (i < N) {
-		int count = i / n;
-		int offset = i - count * n;	// which component of the attribute
-
-		for (int j = 0; j < componentTypeByteSize; j++) {
-			
-			dev_dst[count * componentTypeByteSize * n 
-				+ offset * componentTypeByteSize 
-				+ j]
-
-				= 
-
-			dev_src[byteOffset 
-				+ count * (byteStride == 0 ? componentTypeByteSize * n : byteStride) 
-				+ offset * componentTypeByteSize 
-				+ j];
-		}
-	}
-	
+__global__
+void _deviceBufferCopy(int N, BufferByte *dev_dst, const BufferByte *dev_src, int n, int byteStride, int byteOffset,
+                       int componentTypeByteSize) {
+
+    // Attribute (vec3 position)
+    // component (3 * float)
+    // byte (4 * byte)
+
+    // id of component
+    int i = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+    if (i < N) {
+        int count = i / n;
+        int offset = i - count * n;    // which component of the attribute
+
+        for (int j = 0; j < componentTypeByteSize; j++) {
+
+            dev_dst[count * componentTypeByteSize * n
+                    + offset * componentTypeByteSize
+                    + j]
+
+                    =
+
+                    dev_src[byteOffset
+                            + count * (byteStride == 0 ? componentTypeByteSize * n : byteStride)
+                            + offset * componentTypeByteSize
+                            + j];
+        }
+    }
+
 
 }
 
 __global__
 void _nodeMatrixTransform(
-	int numVertices,
-	VertexAttributePosition* position,
-	VertexAttributeNormal* normal,
-	glm::mat4 MV, glm::mat3 MV_normal) {
-
-	// vertex id
-	int vid = (blockIdx.x * blockDim.x) + threadIdx.x;
-	if (vid < numVertices) {
-		position[vid] = glm::vec3(MV * glm::vec4(position[vid], 1.0f));
-		normal[vid] = glm::normalize(MV_normal * normal[vid]);
-	}
+        int numVertices,
+        VertexAttributePosition *position,
+        VertexAttributeNormal *normal,
+        glm::mat4 MV, glm::mat3 MV_normal) {
+
+    // vertex id
+    int vid = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (vid < numVertices) {
+        position[vid] = glm::vec3(MV * glm::vec4(position[vid], 1.0f));
+        normal[vid] = glm::normalize(MV_normal * normal[vid]);
+    }
 }
 
-glm::mat4 getMatrixFromNodeMatrixVector(const tinygltf::Node & n) {
-	
-	glm::mat4 curMatrix(1.0);
-
-	const std::vector<double> &m = n.matrix;
-	if (m.size() > 0) {
-		// matrix, copy it
-
-		for (int i = 0; i < 4; i++) {
-			for (int j = 0; j < 4; j++) {
-				curMatrix[i][j] = (float)m.at(4 * i + j);
-			}
-		}
-	} else {
-		// no matrix, use rotation, scale, translation
-
-		if (n.translation.size() > 0) {
-			curMatrix[3][0] = n.translation[0];
-			curMatrix[3][1] = n.translation[1];
-			curMatrix[3][2] = n.translation[2];
-		}
-
-		if (n.rotation.size() > 0) {
-			glm::mat4 R;
-			glm::quat q;
-			q[0] = n.rotation[0];
-			q[1] = n.rotation[1];
-			q[2] = n.rotation[2];
-
-			R = glm::mat4_cast(q);
-			curMatrix = curMatrix * R;
-		}
-
-		if (n.scale.size() > 0) {
-			curMatrix = curMatrix * glm::scale(glm::vec3(n.scale[0], n.scale[1], n.scale[2]));
-		}
-	}
-
-	return curMatrix;
-}
+glm::mat4 getMatrixFromNodeMatrixVector(const tinygltf::Node &n) {
 
-void traverseNode (
-	std::map<std::string, glm::mat4> & n2m,
-	const tinygltf::Scene & scene,
-	const std::string & nodeString,
-	const glm::mat4 & parentMatrix
-	) 
-{
-	const tinygltf::Node & n = scene.nodes.at(nodeString);
-	glm::mat4 M = parentMatrix * getMatrixFromNodeMatrixVector(n);
-	n2m.insert(std::pair<std::string, glm::mat4>(nodeString, M));
-
-	auto it = n.children.begin();
-	auto itEnd = n.children.end();
-
-	for (; it != itEnd; ++it) {
-		traverseNode(n2m, scene, *it, M);
-	}
-}
+    glm::mat4 curMatrix(1.0);
 
-void rasterizeSetBuffers(const tinygltf::Scene & scene) {
+    const std::vector<double> &m = n.matrix;
+    if (m.size() > 0) {
+        // matrix, copy it
 
-	totalNumPrimitives = 0;
+        for (int i = 0; i < 4; i++) {
+            for (int j = 0; j < 4; j++) {
+                curMatrix[i][j] = (float)m.at(4 * i + j);
+            }
+        }
+    }
+    else {
+        // no matrix, use rotation, scale, translation
+
+        if (n.translation.size() > 0) {
+            curMatrix[3][0] = n.translation[0];
+            curMatrix[3][1] = n.translation[1];
+            curMatrix[3][2] = n.translation[2];
+        }
+
+        if (n.rotation.size() > 0) {
+            glm::mat4 R;
+            glm::quat q;
+            q[0] = n.rotation[0];
+            q[1] = n.rotation[1];
+            q[2] = n.rotation[2];
+
+            R = glm::mat4_cast(q);
+            curMatrix = curMatrix * R;
+        }
+
+        if (n.scale.size() > 0) {
+            curMatrix = curMatrix * glm::scale(glm::vec3(n.scale[0], n.scale[1], n.scale[2]));
+        }
+    }
 
-	std::map<std::string, BufferByte*> bufferViewDevPointers;
+    return curMatrix;
+}
 
-	// 1. copy all `bufferViews` to device memory
-	{
-		std::map<std::string, tinygltf::BufferView>::const_iterator it(
-			scene.bufferViews.begin());
-		std::map<std::string, tinygltf::BufferView>::const_iterator itEnd(
-			scene.bufferViews.end());
+void traverseNode(
+        std::map <std::string, glm::mat4> &n2m,
+        const tinygltf::Scene &scene,
+        const std::string &nodeString,
+        const glm::mat4 &parentMatrix
+) {
+    const tinygltf::Node &n = scene.nodes.at(nodeString);
+    glm::mat4 M = parentMatrix * getMatrixFromNodeMatrixVector(n);
+    n2m.insert(std::pair<std::string, glm::mat4>(nodeString, M));
+
+    auto it = n.children.begin();
+    auto itEnd = n.children.end();
+
+    for (; it != itEnd; ++it) {
+        traverseNode(n2m, scene, *it, M);
+    }
+}
 
-		for (; it != itEnd; it++) {
-			const std::string key = it->first;
-			const tinygltf::BufferView &bufferView = it->second;
-			if (bufferView.target == 0) {
-				continue; // Unsupported bufferView.
-			}
+void rasterizeSetBuffers(const tinygltf::Scene &scene) {
 
-			const tinygltf::Buffer &buffer = scene.buffers.at(bufferView.buffer);
+    totalNumPrimitives = 0;
 
-			BufferByte* dev_bufferView;
-			cudaMalloc(&dev_bufferView, bufferView.byteLength);
-			cudaMemcpy(dev_bufferView, &buffer.data.front() + bufferView.byteOffset, bufferView.byteLength, cudaMemcpyHostToDevice);
+    std::map < std::string, BufferByte * > bufferViewDevPointers;
 
-			checkCUDAError("Set BufferView Device Mem");
+    // 1. copy all `bufferViews` to device memory
+    {
+        std::map<std::string, tinygltf::BufferView>::const_iterator it(
+                scene.bufferViews.begin());
+        std::map<std::string, tinygltf::BufferView>::const_iterator itEnd(
+                scene.bufferViews.end());
 
-			bufferViewDevPointers.insert(std::make_pair(key, dev_bufferView));
+        for (; it != itEnd; it++) {
+            const std::string key = it->first;
+            const tinygltf::BufferView &bufferView = it->second;
+            if (bufferView.target == 0) {
+                continue; // Unsupported bufferView.
+            }
 
-		}
-	}
+            const tinygltf::Buffer &buffer = scene.buffers.at(bufferView.buffer);
 
+            BufferByte *dev_bufferView;
+            cudaMalloc(&dev_bufferView, bufferView.byteLength);
+            cudaMemcpy(dev_bufferView, &buffer.data.front() + bufferView.byteOffset, bufferView.byteLength,
+                       cudaMemcpyHostToDevice);
 
+            checkCUDAError("Set BufferView Device Mem");
 
-	// 2. for each mesh: 
-	//		for each primitive: 
-	//			build device buffer of indices, materail, and each attributes
-	//			and store these pointers in a map
-	{
+            bufferViewDevPointers.insert(std::make_pair(key, dev_bufferView));
 
-		std::map<std::string, glm::mat4> nodeString2Matrix;
-		auto rootNodeNamesList = scene.scenes.at(scene.defaultScene);
-
-		{
-			auto it = rootNodeNamesList.begin();
-			auto itEnd = rootNodeNamesList.end();
-			for (; it != itEnd; ++it) {
-				traverseNode(nodeString2Matrix, scene, *it, glm::mat4(1.0f));
-			}
-		}
+        }
+    }
 
 
-		// parse through node to access mesh
 
-		auto itNode = nodeString2Matrix.begin();
-		auto itEndNode = nodeString2Matrix.end();
-		for (; itNode != itEndNode; ++itNode) {
-
-			const tinygltf::Node & N = scene.nodes.at(itNode->first);
-			const glm::mat4 & matrix = itNode->second;
-			const glm::mat3 & matrixNormal = glm::transpose(glm::inverse(glm::mat3(matrix)));
-
-			auto itMeshName = N.meshes.begin();
-			auto itEndMeshName = N.meshes.end();
-
-			for (; itMeshName != itEndMeshName; ++itMeshName) {
-
-				const tinygltf::Mesh & mesh = scene.meshes.at(*itMeshName);
-
-				auto res = mesh2PrimitivesMap.insert(std::pair<std::string, std::vector<PrimitiveDevBufPointers>>(mesh.name, std::vector<PrimitiveDevBufPointers>()));
-				std::vector<PrimitiveDevBufPointers> & primitiveVector = (res.first)->second;
-
-				// for each primitive
-				for (size_t i = 0; i < mesh.primitives.size(); i++) {
-					const tinygltf::Primitive &primitive = mesh.primitives[i];
-
-					if (primitive.indices.empty())
-						return;
-
-					// TODO: add new attributes for your PrimitiveDevBufPointers when you add new attributes
-					VertexIndex* dev_indices = NULL;
-					VertexAttributePosition* dev_position = NULL;
-					VertexAttributeNormal* dev_normal = NULL;
-					VertexAttributeTexcoord* dev_texcoord0 = NULL;
-
-					// ----------Indices-------------
-
-					const tinygltf::Accessor &indexAccessor = scene.accessors.at(primitive.indices);
-					const tinygltf::BufferView &bufferView = scene.bufferViews.at(indexAccessor.bufferView);
-					BufferByte* dev_bufferView = bufferViewDevPointers.at(indexAccessor.bufferView);
-
-					// assume type is SCALAR for indices
-					int n = 1;
-					int numIndices = indexAccessor.count;
-					int componentTypeByteSize = sizeof(VertexIndex);
-					int byteLength = numIndices * n * componentTypeByteSize;
-
-					dim3 numThreadsPerBlock(128);
-					dim3 numBlocks((numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
-					cudaMalloc(&dev_indices, byteLength);
-					_deviceBufferCopy << <numBlocks, numThreadsPerBlock >> > (
-						numIndices,
-						(BufferByte*)dev_indices,
-						dev_bufferView,
-						n,
-						indexAccessor.byteStride,
-						indexAccessor.byteOffset,
-						componentTypeByteSize);
-
-
-					checkCUDAError("Set Index Buffer");
-
-
-					// ---------Primitive Info-------
-
-					// Warning: LINE_STRIP is not supported in tinygltfloader
-					int numPrimitives;
-					PrimitiveType primitiveType;
-					switch (primitive.mode) {
-					case TINYGLTF_MODE_TRIANGLES:
-						primitiveType = PrimitiveType::Triangle;
-						numPrimitives = numIndices / 3;
-						break;
-					case TINYGLTF_MODE_TRIANGLE_STRIP:
-						primitiveType = PrimitiveType::Triangle;
-						numPrimitives = numIndices - 2;
-						break;
-					case TINYGLTF_MODE_TRIANGLE_FAN:
-						primitiveType = PrimitiveType::Triangle;
-						numPrimitives = numIndices - 2;
-						break;
-					case TINYGLTF_MODE_LINE:
-						primitiveType = PrimitiveType::Line;
-						numPrimitives = numIndices / 2;
-						break;
-					case TINYGLTF_MODE_LINE_LOOP:
-						primitiveType = PrimitiveType::Line;
-						numPrimitives = numIndices + 1;
-						break;
-					case TINYGLTF_MODE_POINTS:
-						primitiveType = PrimitiveType::Point;
-						numPrimitives = numIndices;
-						break;
-					default:
-						// output error
-						break;
-					};
-
-
-					// ----------Attributes-------------
-
-					auto it(primitive.attributes.begin());
-					auto itEnd(primitive.attributes.end());
-
-					int numVertices = 0;
-					// for each attribute
-					for (; it != itEnd; it++) {
-						const tinygltf::Accessor &accessor = scene.accessors.at(it->second);
-						const tinygltf::BufferView &bufferView = scene.bufferViews.at(accessor.bufferView);
-
-						int n = 1;
-						if (accessor.type == TINYGLTF_TYPE_SCALAR) {
-							n = 1;
-						}
-						else if (accessor.type == TINYGLTF_TYPE_VEC2) {
-							n = 2;
-						}
-						else if (accessor.type == TINYGLTF_TYPE_VEC3) {
-							n = 3;
-						}
-						else if (accessor.type == TINYGLTF_TYPE_VEC4) {
-							n = 4;
-						}
+    // 2. for each mesh:
+    //		for each primitive:
+    //			build device buffer of indices, materail, and each attributes
+    //			and store these pointers in a map
+    {
+
+        std::map <std::string, glm::mat4> nodeString2Matrix;
+        auto rootNodeNamesList = scene.scenes.at(scene.defaultScene);
+
+        {
+            auto it = rootNodeNamesList.begin();
+            auto itEnd = rootNodeNamesList.end();
+            for (; it != itEnd; ++it) {
+                traverseNode(nodeString2Matrix, scene, *it, glm::mat4(1.0f));
+            }
+        }
+
+
+        // parse through node to access mesh
+
+        auto itNode = nodeString2Matrix.begin();
+        auto itEndNode = nodeString2Matrix.end();
+        for (; itNode != itEndNode; ++itNode) {
+
+            const tinygltf::Node &N = scene.nodes.at(itNode->first);
+            const glm::mat4 &matrix = itNode->second;
+            const glm::mat3 &matrixNormal = glm::transpose(glm::inverse(glm::mat3(matrix)));
+
+            auto itMeshName = N.meshes.begin();
+            auto itEndMeshName = N.meshes.end();
+
+            for (; itMeshName != itEndMeshName; ++itMeshName) {
+
+                const tinygltf::Mesh &mesh = scene.meshes.at(*itMeshName);
+
+                auto res = mesh2PrimitivesMap.insert(std::pair < std::string, std::vector < PrimitiveDevBufPointers
+                        >> (mesh.name, std::vector<PrimitiveDevBufPointers>()));
+                std::vector <PrimitiveDevBufPointers> &primitiveVector = (res.first)->second;
+
+                // for each primitive
+                for (size_t i = 0; i < mesh.primitives.size(); i++) {
+                    const tinygltf::Primitive &primitive = mesh.primitives[i];
+
+                    if (primitive.indices.empty())
+                        return;
+
+                    // TODO: add new attributes for your PrimitiveDevBufPointers when you add new attributes
+                    VertexIndex *dev_indices = NULL;
+                    VertexAttributePosition *dev_position = NULL;
+                    VertexAttributeNormal *dev_normal = NULL;
+                    VertexAttributeTexcoord *dev_texcoord0 = NULL;
+
+                    // ----------Indices-------------
+
+                    const tinygltf::Accessor &indexAccessor = scene.accessors.at(primitive.indices);
+                    const tinygltf::BufferView &bufferView = scene.bufferViews.at(indexAccessor.bufferView);
+                    BufferByte *dev_bufferView = bufferViewDevPointers.at(indexAccessor.bufferView);
+
+                    // assume type is SCALAR for indices
+                    int n = 1;
+                    int numIndices = indexAccessor.count;
+                    int componentTypeByteSize = sizeof(VertexIndex);
+                    int byteLength = numIndices * n * componentTypeByteSize;
+
+                    dim3 numThreadsPerBlock(128);
+                    dim3 numBlocks((numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
+                    cudaMalloc(&dev_indices, byteLength);
+                    _deviceBufferCopy << < numBlocks, numThreadsPerBlock >> > (
+                            numIndices,
+                                    (BufferByte *)dev_indices,
+                                    dev_bufferView,
+                                    n,
+                                    indexAccessor.byteStride,
+                                    indexAccessor.byteOffset,
+                                    componentTypeByteSize);
+
+
+                    checkCUDAError("Set Index Buffer");
+
+
+                    // ---------Primitive Info-------
+
+                    // Warning: LINE_STRIP is not supported in tinygltfloader
+                    int numPrimitives;
+                    PrimitiveType primitiveType;
+                    switch (primitive.mode) {
+                        case TINYGLTF_MODE_TRIANGLES:
+                            primitiveType = PrimitiveType::Triangle;
+                            numPrimitives = numIndices / 3;
+                            break;
+                        case TINYGLTF_MODE_TRIANGLE_STRIP:
+                            primitiveType = PrimitiveType::Triangle;
+                            numPrimitives = numIndices - 2;
+                            break;
+                        case TINYGLTF_MODE_TRIANGLE_FAN:
+                            primitiveType = PrimitiveType::Triangle;
+                            numPrimitives = numIndices - 2;
+                            break;
+                        case TINYGLTF_MODE_LINE:
+                            primitiveType = PrimitiveType::Line;
+                            numPrimitives = numIndices / 2;
+                            break;
+                        case TINYGLTF_MODE_LINE_LOOP:
+                            primitiveType = PrimitiveType::Line;
+                            numPrimitives = numIndices + 1;
+                            break;
+                        case TINYGLTF_MODE_POINTS:
+                            primitiveType = PrimitiveType::Point;
+                            numPrimitives = numIndices;
+                            break;
+                        default:
+                            // output error
+                            break;
+                    };
+
+
+                    // ----------Attributes-------------
+
+                    auto it(primitive.attributes.begin());
+                    auto itEnd(primitive.attributes.end());
+
+                    int numVertices = 0;
+                    // for each attribute
+                    for (; it != itEnd; it++) {
+                        const tinygltf::Accessor &accessor = scene.accessors.at(it->second);
+                        const tinygltf::BufferView &bufferView = scene.bufferViews.at(accessor.bufferView);
+
+                        int n = 1;
+                        if (accessor.type == TINYGLTF_TYPE_SCALAR) {
+                            n = 1;
+                        }
+                        else if (accessor.type == TINYGLTF_TYPE_VEC2) {
+                            n = 2;
+                        }
+                        else if (accessor.type == TINYGLTF_TYPE_VEC3) {
+                            n = 3;
+                        }
+                        else if (accessor.type == TINYGLTF_TYPE_VEC4) {
+                            n = 4;
+                        }
+
+                        BufferByte *dev_bufferView = bufferViewDevPointers.at(accessor.bufferView);
+                        BufferByte **dev_attribute = NULL;
+
+                        numVertices = accessor.count;
+                        int componentTypeByteSize;
+
+                        // Note: since the type of our attribute array (dev_position) is static (float32)
+                        // We assume the glTF model attribute type are 5126(FLOAT) here
+
+                        if (it->first.compare("POSITION") == 0) {
+                            componentTypeByteSize = sizeof(VertexAttributePosition) / n;
+                            dev_attribute = (BufferByte **)&dev_position;
+                        }
+                        else if (it->first.compare("NORMAL") == 0) {
+                            componentTypeByteSize = sizeof(VertexAttributeNormal) / n;
+                            dev_attribute = (BufferByte **)&dev_normal;
+                        }
+                        else if (it->first.compare("TEXCOORD_0") == 0) {
+                            componentTypeByteSize = sizeof(VertexAttributeTexcoord) / n;
+                            dev_attribute = (BufferByte **)&dev_texcoord0;
+                        }
+
+                        std::cout << accessor.bufferView << "  -  " << it->second << "  -  " << it->first << '\n';
+
+                        dim3 numThreadsPerBlock(128);
+                        dim3 numBlocks((n * numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
+                        int byteLength = numVertices * n * componentTypeByteSize;
+                        cudaMalloc(dev_attribute, byteLength);
+
+                        _deviceBufferCopy << < numBlocks, numThreadsPerBlock >> > (
+                                n * numVertices,
+                                        *dev_attribute,
+                                        dev_bufferView,
+                                        n,
+                                        accessor.byteStride,
+                                        accessor.byteOffset,
+                                        componentTypeByteSize);
+
+                        std::string msg = "Set Attribute Buffer: " + it->first;
+                        checkCUDAError(msg.c_str());
+                    }
+
+                    // malloc for VertexOut
+                    VertexOut *dev_vertexOut;
+                    cudaMalloc(&dev_vertexOut, numVertices * sizeof(VertexOut));
+                    checkCUDAError("Malloc VertexOut Buffer");
+
+                    // ----------Materials-------------
+
+                    // You can only worry about this part once you started to
+                    // implement textures for your rasterizer
+                    TextureData *dev_diffuseTex = NULL;
+                    int diffuseTexWidth = 0;
+                    int diffuseTexHeight = 0;
+                    if (!primitive.material.empty()) {
+                        const tinygltf::Material &mat = scene.materials.at(primitive.material);
+                        printf("material.name = %s\n", mat.name.c_str());
+
+                        if (mat.values.find("diffuse") != mat.values.end()) {
+                            std::string diffuseTexName = mat.values.at("diffuse").string_value;
+                            if (scene.textures.find(diffuseTexName) != scene.textures.end()) {
+                                const tinygltf::Texture &tex = scene.textures.at(diffuseTexName);
+                                if (scene.images.find(tex.source) != scene.images.end()) {
+                                    const tinygltf::Image &image = scene.images.at(tex.source);
+
+                                    size_t s = image.image.size() * sizeof(TextureData);
+                                    cudaMalloc(&dev_diffuseTex, s);
+                                    cudaMemcpy(dev_diffuseTex, &image.image.at(0), s, cudaMemcpyHostToDevice);
+
+                                    diffuseTexWidth = image.width;
+                                    diffuseTexHeight = image.height;
+
+                                    checkCUDAError("Set Texture Image data");
+                                }
+                            }
+                        }
+
+                        // TODO: write your code for other materails
+                        // You may have to take a look at tinygltfloader
+                        // You can also use the above code loading diffuse material as a start point
+                    }
+
+
+                    // ---------Node hierarchy transform--------
+                    cudaDeviceSynchronize();
+
+                    dim3 numBlocksNodeTransform((numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
+                    _nodeMatrixTransform << < numBlocksNodeTransform, numThreadsPerBlock >> > (
+                            numVertices,
+                                    dev_position,
+                                    dev_normal,
+                                    matrix,
+                                    matrixNormal);
+
+                    checkCUDAError("Node hierarchy transformation");
+
+                    // at the end of the for loop of primitive
+                    // push dev pointers to map
+                    primitiveVector.push_back(PrimitiveDevBufPointers{
+                            primitive.mode,
+                            primitiveType,
+                            numPrimitives,
+                            numIndices,
+                            numVertices,
+
+                            dev_indices,
+                            dev_position,
+                            dev_normal,
+                            dev_texcoord0,
+
+                            dev_diffuseTex,
+                            diffuseTexWidth,
+                            diffuseTexHeight,
+
+                            dev_vertexOut    //VertexOut
+                    });
+
+                    totalNumPrimitives += numPrimitives;
+
+                } // for each primitive
+
+            } // for each mesh
+
+        } // for each node
 
-						BufferByte * dev_bufferView = bufferViewDevPointers.at(accessor.bufferView);
-						BufferByte ** dev_attribute = NULL;
+    }
 
-						numVertices = accessor.count;
-						int componentTypeByteSize;
 
-						// Note: since the type of our attribute array (dev_position) is static (float32)
-						// We assume the glTF model attribute type are 5126(FLOAT) here
+    // 3. Malloc for dev_primitives
+    {
+        cudaMalloc(&dev_primitives, totalNumPrimitives * sizeof(Primitive));
+    }
 
-						if (it->first.compare("POSITION") == 0) {
-							componentTypeByteSize = sizeof(VertexAttributePosition) / n;
-							dev_attribute = (BufferByte**)&dev_position;
-						}
-						else if (it->first.compare("NORMAL") == 0) {
-							componentTypeByteSize = sizeof(VertexAttributeNormal) / n;
-							dev_attribute = (BufferByte**)&dev_normal;
-						}
-						else if (it->first.compare("TEXCOORD_0") == 0) {
-							componentTypeByteSize = sizeof(VertexAttributeTexcoord) / n;
-							dev_attribute = (BufferByte**)&dev_texcoord0;
-						}
 
-						std::cout << accessor.bufferView << "  -  " << it->second << "  -  " << it->first << '\n';
-
-						dim3 numThreadsPerBlock(128);
-						dim3 numBlocks((n * numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
-						int byteLength = numVertices * n * componentTypeByteSize;
-						cudaMalloc(dev_attribute, byteLength);
-
-						_deviceBufferCopy << <numBlocks, numThreadsPerBlock >> > (
-							n * numVertices,
-							*dev_attribute,
-							dev_bufferView,
-							n,
-							accessor.byteStride,
-							accessor.byteOffset,
-							componentTypeByteSize);
-
-						std::string msg = "Set Attribute Buffer: " + it->first;
-						checkCUDAError(msg.c_str());
-					}
-
-					// malloc for VertexOut
-					VertexOut* dev_vertexOut;
-					cudaMalloc(&dev_vertexOut, numVertices * sizeof(VertexOut));
-					checkCUDAError("Malloc VertexOut Buffer");
-
-					// ----------Materials-------------
-
-					// You can only worry about this part once you started to 
-					// implement textures for your rasterizer
-					TextureData* dev_diffuseTex = NULL;
-					int diffuseTexWidth = 0;
-					int diffuseTexHeight = 0;
-					if (!primitive.material.empty()) {
-						const tinygltf::Material &mat = scene.materials.at(primitive.material);
-						printf("material.name = %s\n", mat.name.c_str());
-
-						if (mat.values.find("diffuse") != mat.values.end()) {
-							std::string diffuseTexName = mat.values.at("diffuse").string_value;
-							if (scene.textures.find(diffuseTexName) != scene.textures.end()) {
-								const tinygltf::Texture &tex = scene.textures.at(diffuseTexName);
-								if (scene.images.find(tex.source) != scene.images.end()) {
-									const tinygltf::Image &image = scene.images.at(tex.source);
-
-									size_t s = image.image.size() * sizeof(TextureData);
-									cudaMalloc(&dev_diffuseTex, s);
-									cudaMemcpy(dev_diffuseTex, &image.image.at(0), s, cudaMemcpyHostToDevice);
-									
-									diffuseTexWidth = image.width;
-									diffuseTexHeight = image.height;
-
-									checkCUDAError("Set Texture Image data");
-								}
-							}
-						}
+    // Finally, cudaFree raw dev_bufferViews
+    {
 
-						// TODO: write your code for other materails
-						// You may have to take a look at tinygltfloader
-						// You can also use the above code loading diffuse material as a start point 
-					}
+        std::map<std::string, BufferByte *>::const_iterator it(bufferViewDevPointers.begin());
+        std::map<std::string, BufferByte *>::const_iterator itEnd(bufferViewDevPointers.end());
 
+        //bufferViewDevPointers
 
-					// ---------Node hierarchy transform--------
-					cudaDeviceSynchronize();
-					
-					dim3 numBlocksNodeTransform((numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
-					_nodeMatrixTransform << <numBlocksNodeTransform, numThreadsPerBlock >> > (
-						numVertices,
-						dev_position,
-						dev_normal,
-						matrix,
-						matrixNormal);
+        for (; it != itEnd; it++) {
+            cudaFree(it->second);
+        }
 
-					checkCUDAError("Node hierarchy transformation");
+        checkCUDAError("Free BufferView Device Mem");
+    }
 
-					// at the end of the for loop of primitive
-					// push dev pointers to map
-					primitiveVector.push_back(PrimitiveDevBufPointers{
-						primitive.mode,
-						primitiveType,
-						numPrimitives,
-						numIndices,
-						numVertices,
 
-						dev_indices,
-						dev_position,
-						dev_normal,
-						dev_texcoord0,
+}
 
-						dev_diffuseTex,
-						diffuseTexWidth,
-						diffuseTexHeight,
 
-						dev_vertexOut	//VertexOut
-					});
+__global__
+void _vertexTransformAndAssembly(
+        int numVertices,
+        PrimitiveDevBufPointers primitive,
+        glm::mat4 MVP, glm::mat4 MV, glm::mat3 MV_normal,
+        int width, int height) {
+
+    // vertex id
+    int vid = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (vid < numVertices) {
+
+        // TODO: Apply vertex transformation here
+        // Multiply the MVP matrix for each vertex position, this will transform everything into clipping space
+        glm::vec4 pos = MVP * glm::vec4(primitive.dev_position[vid], 1.0f);
+        // Then divide the pos by its w element to transform into NDC space
+        pos /= pos.w;
+        // Finally transform x and y to viewport space
+        pos.x = (float)width * (1.f - pos.x) / 2.f;
+        pos.y = (float)height * (1.f - pos.y) / 2.f;
+        // TODO: Apply vertex assembly here
+        // Assemble all attribute arraies into the primitive array
+        primitive.dev_verticesOut[vid].pos = pos;
+        primitive.dev_verticesOut[vid].eyePos = glm::vec3(MV * glm::vec4(primitive.dev_position[vid], 1.0f));
+        primitive.dev_verticesOut[vid].eyeNor = glm::normalize(MV_normal * primitive.dev_normal[vid]);
+
+        if (primitive.dev_diffuseTex != NULL) {
+            primitive.dev_verticesOut[vid].dev_diffuseTex = primitive.dev_diffuseTex;
+            primitive.dev_verticesOut[vid].texcoord0 = primitive.dev_texcoord0[vid];
+            primitive.dev_verticesOut[vid].texHeight = primitive.diffuseTexHeight;
+            primitive.dev_verticesOut[vid].texWidth = primitive.diffuseTexWidth;
+
+        }
+        else {
+            primitive.dev_verticesOut[vid].dev_diffuseTex = NULL;
+
+        }
+    }
+}
 
-					totalNumPrimitives += numPrimitives;
 
-				} // for each primitive
+static int curPrimitiveBeginId = 0;
 
-			} // for each mesh
+__global__
+void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive *dev_primitives,
+                        PrimitiveDevBufPointers primitive) {
 
-		} // for each node
+    // index id
+    int iid = (blockIdx.x * blockDim.x) + threadIdx.x;
 
-	}
-	
+    if (iid < numIndices) {
 
-	// 3. Malloc for dev_primitives
-	{
-		cudaMalloc(&dev_primitives, totalNumPrimitives * sizeof(Primitive));
-	}
-	
+        // TODO: uncomment the following code for a start
+        // This is primitive assembly for triangles
 
-	// Finally, cudaFree raw dev_bufferViews
-	{
+        int pid;    // id for cur primitives vector
+        if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) {
+            pid = iid / (int)primitive.primitiveType;
+            dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType]
+                    = primitive.dev_verticesOut[primitive.dev_indices[iid]];
+            dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType].col
+                    = glm::vec3(0.9f);
+        }
 
-		std::map<std::string, BufferByte*>::const_iterator it(bufferViewDevPointers.begin());
-		std::map<std::string, BufferByte*>::const_iterator itEnd(bufferViewDevPointers.end());
-			
-			//bufferViewDevPointers
 
-		for (; it != itEnd; it++) {
-			cudaFree(it->second);
-		}
+        // TODO: other primitive types (point, line)
+    }
 
-		checkCUDAError("Free BufferView Device Mem");
-	}
+}
 
+struct primitive_culling {
+    __host__ __device__
 
+    bool operator()(const Primitive &p) {
+        return p.cull;
+    }
+};
+
+// wikipedia back-face culling
+__global__ void _backfaceCulling(const int numPrims, Primitive *dev_primitives) {
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index < numPrims) {
+        glm::vec3 v1 = dev_primitives[index].v[1].eyePos - dev_primitives[index].v[0].eyePos;
+        glm::vec3 v2 = dev_primitives[index].v[2].eyePos - dev_primitives[index].v[0].eyePos;
+        glm::vec3 normal = glm::cross(v1, v2);
+        dev_primitives[index].cull = normal.z >= 0.f;
+    }
 }
 
+// rasterization for points and lines
+// reference: http://www.cs.cornell.edu/courses/cs4620/2010fa/lectures/07pipeline.pdf
+__global__ void _rasterizePoint(const int numPrims, const int height, const int width,
+                                Primitive *dev_primitives, Fragment *dev_fragment) {
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index < numPrims) {
+        Primitive curr_prim = dev_primitives[index];
+        glm::vec3 tri[3] = { glm::vec3(curr_prim.v[0].pos),
+                             glm::vec3(curr_prim.v[1].pos), glm::vec3(curr_prim.v[2].pos) };
+        for (int i = 0; i < 3; i++) {
+            int x = (int)tri[i].x;
+            int y = (int)tri[i].y;
+            if (x >= 0 && x < width && y >= 0 && y < height) {
+                int pixel = x + y * width;
+                dev_fragment[pixel].color = glm::vec3(0.4f, 0.8f, 0.4f);
+            }
+        }
+    }
 
-
-__global__ 
-void _vertexTransformAndAssembly(
-	int numVertices, 
-	PrimitiveDevBufPointers primitive, 
-	glm::mat4 MVP, glm::mat4 MV, glm::mat3 MV_normal, 
-	int width, int height) {
-
-	// vertex id
-	int vid = (blockIdx.x * blockDim.x) + threadIdx.x;
-	if (vid < numVertices) {
-
-		// TODO: Apply vertex transformation here
-		// Multiply the MVP matrix for each vertex position, this will transform everything into clipping space
-		// Then divide the pos by its w element to transform into NDC space
-		// Finally transform x and y to viewport space
-
-		// TODO: Apply vertex assembly here
-		// Assemble all attribute arraies into the primitive array
-		
-	}
 }
 
+__host__ __device__
 
+void _bresenham(glm::vec3 pt1, glm::vec3 pt2, const int height, const int width,
+                Fragment *dev_fragment) {
+    float x1 = glm::clamp(pt1[0], 0.f, (float)(width - 1));
+    float x2 = glm::clamp(pt2[0], 0.f, (float)(width - 1));
+    float y1 = glm::clamp(pt1[1], 0.f, (float)(height - 1));
+    float y2 = glm::clamp(pt2[1], 0.f, (float)(height - 1));
+    const glm::vec3 color(0.8f, 0.8f, 0.8f);
 
-static int curPrimitiveBeginId = 0;
-
-__global__ 
-void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_primitives, PrimitiveDevBufPointers primitive) {
+    bool swapped = (fabs(x2 - x1) < fabs(y2 - y1));
 
-	// index id
-	int iid = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (swapped) {
+        swap(x1, y1);
+        swap(x2, y2);
+    }
 
-	if (iid < numIndices) {
+    if (x1 > x2) {
+        swap(x1, x2);
+        swap(y1, y2);
+    }
 
-		// TODO: uncomment the following code for a start
-		// This is primitive assembly for triangles
+    const float dx = x2 - x1;
+    const float dy = fabs(y2 - y1);
+
+    float err = dx / 2.0f;
+    const int step_size = (y1 < y2) ? 1 : -1;
+    int y = (int)y1;
+
+    int idx;
+
+    for (int x = (int)x1; x < (int)x2; x++) {
+        if (swapped) {
+            idx = y + x * width;
+            dev_fragment[idx].color = color;
+        }
+        else {
+            idx = x + y * width;
+            dev_fragment[idx].color = color;
+        }
+        err -= dy;
+        if (err < 0) {
+            y += step_size;
+            err += dx;
+        }
+    }
 
-		//int pid;	// id for cur primitives vector
-		//if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) {
-		//	pid = iid / (int)primitive.primitiveType;
-		//	dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType]
-		//		= primitive.dev_verticesOut[primitive.dev_indices[iid]];
-		//}
+}
 
+__global__ void _rasterizeLine(const int numPrims, const int height, const int width,
+                               Primitive *dev_primitives, Fragment *dev_fragment) {
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index < numPrims) {
+        Primitive curr_prim = dev_primitives[index];
+        glm::vec3 tri[3] = { glm::vec3(curr_prim.v[0].pos),
+                             glm::vec3(curr_prim.v[1].pos), glm::vec3(curr_prim.v[2].pos) };
+
+        _bresenham(tri[0], tri[1], height, width, dev_fragment);
+        _bresenham(tri[0], tri[2], height, width, dev_fragment);
+        _bresenham(tri[1], tri[2], height, width, dev_fragment);
+    }
 
-		// TODO: other primitive types (point, line)
-	}
-	
 }
 
 
+__global__ void _rasterizeTraingle(const int numPrims, const int height, const int width,
+                                   Primitive *dev_primitives, int *dev_depth, Fragment *dev_fragment) {
+    int index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index < numPrims) {
+        Primitive curr_prim = dev_primitives[index];
+        glm::vec3 tri[3] = { glm::vec3(curr_prim.v[0].pos),
+                             glm::vec3(curr_prim.v[1].pos), glm::vec3(curr_prim.v[2].pos) };
+
+        AABB curr_box = getAABBForTriangle(tri);
+
+        /* reference: cis 460 slides
+        * scratchpixel.com perspective correct interpolation vertex attributes & wikipedia texture mapping
+        */
+        for (int i = imax(curr_box.min.x, 0); i < imin(curr_box.max.x, width); i++) {
+            for (int j = imax(curr_box.min.y, 0); j < imin(curr_box.max.y, height); j++) {
+                glm::vec3 bary_coord = calculateBarycentricCoordinate(tri, glm::vec2(i, j));
+                int pixel = j * width + i;// huge bug here omg!
+                if (isBarycentricCoordInBounds(bary_coord)) {
+                    // use color with smallest z-coordinate
+                    int depth = static_cast<int>(glm::clamp(-getZAtCoordinate(bary_coord, tri), -1.f, 1.f) * INT_MAX);
+                    atomicMin(&dev_depth[pixel], depth);
+
+                    if (depth == dev_depth[pixel]) {
+                        dev_fragment[pixel].eyeNor = glm::normalize(bary_coord.x * curr_prim.v[0].eyeNor
+                                                                    + bary_coord.y * curr_prim.v[1].eyeNor +
+                                                                    bary_coord.z * curr_prim.v[2].eyeNor);
+                        dev_fragment[pixel].color = bary_coord.x * curr_prim.v[0].col
+                                                    + bary_coord.y * curr_prim.v[1].col +
+                                                    bary_coord.z * curr_prim.v[2].col;
+
+                        dev_fragment[pixel].eyePos = bary_coord.x * curr_prim.v[0].eyePos
+                                                     + bary_coord.y * curr_prim.v[1].eyePos +
+                                                     bary_coord.z * curr_prim.v[2].eyePos;
+
+
+#if CORRECT_PROSPECTIVE
+                        glm::vec3 eyePosition[3] = { glm::vec3(curr_prim.v[0].eyePos),
+                                                     glm::vec3(curr_prim.v[1].eyePos), glm::vec3(curr_prim.v[2].eyePos) };
+
+                        float z = computeOneOverZ(bary_coord, eyePosition);
+
+                        glm::vec3 uv[3] = { glm::vec3(curr_prim.v[0].texcoord0, 0.f),
+                                            glm::vec3(curr_prim.v[1].texcoord0, 0.f),
+                                            glm::vec3(curr_prim.v[2].texcoord0, 0.f) };
+
+                        if (curr_prim.v[0].dev_diffuseTex != NULL) {
+                            dev_fragment[pixel].dev_diffuseTex = curr_prim.v[0].dev_diffuseTex;
+                            dev_fragment[pixel].texHeight = curr_prim.v[0].texHeight;
+                            dev_fragment[pixel].texWidth = curr_prim.v[0].texWidth;
+                            dev_fragment[pixel].texcoord0 = glm::vec2(
+                                    correctCoordPerspective(z, bary_coord, eyePosition, uv));
+                        }
+                        else {
+                            dev_fragment[pixel].dev_diffuseTex = NULL;
+                        }
+
+#else
+                        if (curr_prim.v[0].dev_diffuseTex != NULL) {
+							dev_fragment[pixel].dev_diffuseTex = curr_prim.v[0].dev_diffuseTex;
+							dev_fragment[pixel].texHeight = curr_prim.v[0].texHeight;
+							dev_fragment[pixel].texWidth = curr_prim.v[0].texWidth;
+							dev_fragment[pixel].texcoord0 = bary_coord.x * curr_prim.v[0].texcoord0
+								+ bary_coord.y * curr_prim.v[1].texcoord0 +
+								bary_coord.z * curr_prim.v[2].texcoord0;
+						}
+						else {
+							dev_fragment[pixel].dev_diffuseTex = NULL;
+						}
+#endif
+                    }
+
+                }
+            }
+        }
+    }
+}
+
 
 /**
- * Perform rasterization.
- */
-void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal) {
+* Perform rasterization.
+*/
+void rasterize(uchar4 *pbo, const glm::mat4 &MVP, const glm::mat4 &MV, const glm::mat3 MV_normal, int primitive_type) {
     int sideLength2d = 8;
     dim3 blockSize2d(sideLength2d, sideLength2d);
-    dim3 blockCount2d((width  - 1) / blockSize2d.x + 1,
-		(height - 1) / blockSize2d.y + 1);
-
-	// Execute your rasterization pipeline here
-	// (See README for rasterization pipeline outline.)
-
-	// Vertex Process & primitive assembly
-	{
-		curPrimitiveBeginId = 0;
-		dim3 numThreadsPerBlock(128);
-
-		auto it = mesh2PrimitivesMap.begin();
-		auto itEnd = mesh2PrimitivesMap.end();
-
-		for (; it != itEnd; ++it) {
-			auto p = (it->second).begin();	// each primitive
-			auto pEnd = (it->second).end();
-			for (; p != pEnd; ++p) {
-				dim3 numBlocksForVertices((p->numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
-				dim3 numBlocksForIndices((p->numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
-
-				_vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> >(p->numVertices, *p, MVP, MV, MV_normal, width, height);
-				checkCUDAError("Vertex Processing");
-				cudaDeviceSynchronize();
-				_primitiveAssembly << < numBlocksForIndices, numThreadsPerBlock >> >
-					(p->numIndices, 
-					curPrimitiveBeginId, 
-					dev_primitives, 
-					*p);
-				checkCUDAError("Primitive Assembly");
-
-				curPrimitiveBeginId += p->numPrimitives;
-			}
-		}
-
-		checkCUDAError("Vertex Processing and Primitive Assembly");
-	}
-	
-	cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
-	initDepth << <blockCount2d, blockSize2d >> >(width, height, dev_depth);
-	
-	// TODO: rasterize
+    dim3 blockCount2d((width - 1) / blockSize2d.x + 1,
+                      (height - 1) / blockSize2d.y + 1);
+
+    // Execute your rasterization pipeline here
+    // (See README for rasterization pipeline outline.)
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    float miliseconds = 0;
+
+    // Vertex Process & primitive assembly
+    dim3 numThreadsPerBlock(128);
+    {
+        curPrimitiveBeginId = 0;
+
+
+        auto it = mesh2PrimitivesMap.begin();
+        auto itEnd = mesh2PrimitivesMap.end();
+
+        for (; it != itEnd; ++it) {
+            auto p = (it->second).begin();    // each primitive
+            auto pEnd = (it->second).end();
+            for (; p != pEnd; ++p) {
+                dim3 numBlocksForVertices((p->numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
+                dim3 numBlocksForIndices((p->numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
+
+                cudaEventRecord(start);
+                _vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> >
+                                                                       (p->numVertices, *p, MVP, MV, MV_normal, width, height);
+                checkCUDAError("Vertex Processing");
+                cudaDeviceSynchronize();
+
+                cudaEventRecord(stop);
+                cudaEventSynchronize(stop);
+                cudaEventElapsedTime(&miliseconds, start, stop);
+
+                cudaEventRecord(start);
+                _primitiveAssembly << < numBlocksForIndices, numThreadsPerBlock >> >
+                                                             (p->numIndices,
+                                                                     curPrimitiveBeginId,
+                                                                     dev_primitives,
+                                                                     *p);
+                checkCUDAError("Primitive Assembly");
+                cudaEventRecord(stop);
+                cudaEventSynchronize(stop);
+                cudaEventElapsedTime(&miliseconds, start, stop);
+
+                curPrimitiveBeginId += p->numPrimitives;
+            }
+        }
+
+        checkCUDAError("Vertex Processing and Primitive Assembly");
+    }
 
+    cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
+
+    cudaEventRecord(start);
+    initDepth << < blockCount2d, blockSize2d >> > (width, height, dev_depth);
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&miliseconds, start, stop);
+
+    // backface culling
+    dim3 numBlocksForPrimitives;
+#if BACKFACE_CULLING
+    numBlocksForPrimitives = dim3((curPrimitiveBeginId + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
+
+	cudaEventRecord(start);
+	_backfaceCulling << < numBlocksForPrimitives, numThreadsPerBlock >> > (curPrimitiveBeginId, dev_primitives);
+	Primitive *culled_primitives = thrust::partition(thrust::device, dev_primitives,
+		dev_primitives + curPrimitiveBeginId, primitive_culling());
+	checkCUDAError("BackFace culling error");
+	cudaEventRecord(stop);
+	cudaEventSynchronize(stop);
+	cudaEventElapsedTime(&miliseconds, start, stop);
+	curPrimitiveBeginId = (int)(culled_primitives - dev_primitives);
+#endif
+
+    // TODO: rasterize
+    numBlocksForPrimitives = dim3((curPrimitiveBeginId + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
+
+    if (primitive_type == 1) {
+        cudaEventRecord(start);
+        _rasterizePoint << < numBlocksForPrimitives, numThreadsPerBlock >> > (curPrimitiveBeginId, height, width,
+                dev_primitives, dev_fragmentBuffer);
+        checkCUDAError("Point rasterization error");
+        cudaEventRecord(stop);
+        cudaEventSynchronize(stop);
+        cudaEventElapsedTime(&miliseconds, start, stop);
+    }
+    else if (primitive_type == 2) {
+        cudaEventRecord(start);
+        _rasterizeLine << < numBlocksForPrimitives, numThreadsPerBlock >> > (curPrimitiveBeginId, height, width,
+                dev_primitives, dev_fragmentBuffer);
+        checkCUDAError("Line rasterization error");
+        cudaEventRecord(stop);
+        cudaEventSynchronize(stop);
+        cudaEventElapsedTime(&miliseconds, start, stop);
+    }
+
+    else {
+        cudaEventRecord(start);
+        _rasterizeTraingle << < numBlocksForPrimitives, numThreadsPerBlock >> > (curPrimitiveBeginId, height, width,
+                dev_primitives, dev_depth, dev_fragmentBuffer);
+        checkCUDAError("Traingle rasterization error");
+        cudaEventRecord(stop);
+        cudaEventSynchronize(stop);
+        cudaEventElapsedTime(&miliseconds, start, stop);
+    }
 
 
     // Copy depthbuffer colors into framebuffer
-	render << <blockCount2d, blockSize2d >> >(width, height, dev_fragmentBuffer, dev_framebuffer);
-	checkCUDAError("fragment shader");
+    cudaEventRecord(start);
+    render << < blockCount2d, blockSize2d >> > (width, height, dev_fragmentBuffer, dev_framebuffer, primitive_type);
+    checkCUDAError("fragment shader");
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&miliseconds, start, stop);
+
+
     // Copy framebuffer into OpenGL buffer for OpenGL previewing
-    sendImageToPBO<<<blockCount2d, blockSize2d>>>(pbo, width, height, dev_framebuffer);
+    cudaEventRecord(start);
+    sendImageToPBO << < blockCount2d, blockSize2d >> > (pbo, screen_width, screen_height, dev_framebuffer);
     checkCUDAError("copy render result to pbo");
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&miliseconds, start, stop);
 }
 
 /**
- * Called once at the end of the program to free CUDA memory.
- */
+* Called once at the end of the program to free CUDA memory.
+*/
 void rasterizeFree() {
 
     // deconstruct primitives attribute/indices device buffer
 
-	auto it(mesh2PrimitivesMap.begin());
-	auto itEnd(mesh2PrimitivesMap.end());
-	for (; it != itEnd; ++it) {
-		for (auto p = it->second.begin(); p != it->second.end(); ++p) {
-			cudaFree(p->dev_indices);
-			cudaFree(p->dev_position);
-			cudaFree(p->dev_normal);
-			cudaFree(p->dev_texcoord0);
-			cudaFree(p->dev_diffuseTex);
+    auto it(mesh2PrimitivesMap.begin());
+    auto itEnd(mesh2PrimitivesMap.end());
+    for (; it != itEnd; ++it) {
+        for (auto p = it->second.begin(); p != it->second.end(); ++p) {
+            cudaFree(p->dev_indices);
+            cudaFree(p->dev_position);
+            cudaFree(p->dev_normal);
+            cudaFree(p->dev_texcoord0);
+            cudaFree(p->dev_diffuseTex);
 
-			cudaFree(p->dev_verticesOut);
+            cudaFree(p->dev_verticesOut);
 
-			
-			//TODO: release other attributes and materials
-		}
-	}
 
-	////////////
+            //TODO: release other attributes and materials
+        }
+    }
+
+    ////////////
 
     cudaFree(dev_primitives);
     dev_primitives = NULL;
 
-	cudaFree(dev_fragmentBuffer);
-	dev_fragmentBuffer = NULL;
+    cudaFree(dev_fragmentBuffer);
+    dev_fragmentBuffer = NULL;
 
     cudaFree(dev_framebuffer);
     dev_framebuffer = NULL;
 
-	cudaFree(dev_depth);
-	dev_depth = NULL;
+    cudaFree(dev_depth);
+    dev_depth = NULL;
 
     checkCUDAError("rasterize Free");
-}
+}
\ No newline at end of file
diff --git a/src/rasterize.h b/src/rasterize.h
index 560aae9..ee36479 100644
--- a/src/rasterize.h
+++ b/src/rasterize.h
@@ -1,10 +1,10 @@
 /**
- * @file      rasterize.h
- * @brief     CUDA-accelerated rasterization pipeline.
- * @authors   Skeleton code: Yining Karl Li, Kai Ninomiya, Shuai Shao (Shrek)
- * @date      2012-2016
- * @copyright University of Pennsylvania & STUDENT
- */
+* @file      rasterize.h
+* @brief     CUDA-accelerated rasterization pipeline.
+* @authors   Skeleton code: Yining Karl Li, Kai Ninomiya, Shuai Shao (Shrek)
+* @date      2012-2016
+* @copyright University of Pennsylvania & STUDENT
+*/
 
 #pragma once
 
@@ -12,13 +12,13 @@
 #include <glm/gtc/matrix_transform.hpp>
 #include <glm/gtx/transform.hpp>
 
-namespace tinygltf{
-	class Scene;
+namespace tinygltf {
+    class Scene;
 }
 
 
 void rasterizeInit(int width, int height);
 void rasterizeSetBuffers(const tinygltf::Scene & scene);
 
-void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal);
+void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal, int primitive_type);
 void rasterizeFree();
diff --git a/src/rasterizeTools.h b/src/rasterizeTools.h
index 46c701e..9bec127 100644
--- a/src/rasterizeTools.h
+++ b/src/rasterizeTools.h
@@ -99,3 +99,21 @@ float getZAtCoordinate(const glm::vec3 barycentricCoord, const glm::vec3 tri[3])
            + barycentricCoord.y * tri[1].z
            + barycentricCoord.z * tri[2].z);
 }
+
+/**
+ * For a given barycentric coordinate, compute the corresponding z position
+ * (i.e. depth) on the triangle with perspective projection
+ */
+
+__host__ __device__ static
+float computeOneOverZ(const glm::vec3 barycentricCoord, const glm::vec3 tri[3]){
+    return 1.0f / (barycentricCoord.x / tri[0].z + barycentricCoord.y / tri[1].z + barycentricCoord.z / tri[2].z);
+}
+
+__host__ __device__ static
+glm::vec3 correctCoordPerspective(const float z, glm::vec3 barycentricCoord, const glm::vec3 tri[3], const glm::vec3 coord[3]){
+    return z * glm::vec3(coord[0] * barycentricCoord.x / tri[0].z +
+                                coord[1] * barycentricCoord.y / tri[1].z +
+                                coord[2] * barycentricCoord.z / tri[2].z);
+}
+