diff --git a/.gitignore b/.gitignore index 71efc99..83b2190 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ *.xcodeproj build +gltfs/mode_list.txt + # Created by https://www.gitignore.io/api/linux,osx,sublimetext,windows,jetbrains,vim,emacs,cmake,c++,cuda,visualstudio,webstorm,eclipse,xcode ### Linux ### diff --git a/README.md b/README.md index cad1abd..929c150 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,149 @@ CUDA Rasterizer =============== -[CLICK ME FOR INSTRUCTION OF THIS PROJECT](./INSTRUCTION.md) - **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Hanming Zhang +* Tested on: Windows 10 Education, i7-6700K @ 4.00GHz 16.0GB, GTX 980 4096MB (Personal Desktop)
+ + +Project Features +================ + +### Representative GIFs : +![](img/1.gif?) | ![](img/0.gif?) +------------ | ------------- +white cow + bloom (post-processing) | white cow +![](img/2.gif?) | ![](img/3.gif?) +wireframe cow + bloom (post-processing) | wireframe cow
+ +### CUDA Rasterizer Features : + - Basic rasterizer pipeline stages : vertex shading, primitive assembly, rasterization, Lambert fragment shading, depth buffer + - Self-rotating turntable (fixed rotation speed; shared memory used) + - Bloom post-processing (downscale + brightness filter + Gaussian blur, with shared memory) + - Back-face culling (either naively in the rasterizer, or as an independent pipeline stage that removes primitives using thrust::remove_if) + - Perspective-correct color (vertex attribute) interpolation between vertices on a primitive + - Wireframe / Points + - SSAA & \*MSAA (strictly speaking it is a "stupid" MSAA so far and still has problems, but it works in its own "stupid" way; see the related analysis below) + - Bilinear texture filtering
+ +### Controls : + - Left mouse button to rotate the camera + - Right mouse button to move the camera + - Middle mouse button to zoom in/out + - 1, 2, 3 to switch between whole-triangle / wireframe / points rasterization modes + - P key to toggle the bloom post effect
+ + +### About Toggleable Macros : + To enable/disable these features, uncomment/comment the corresponding macros in ***rasterize.cu*** + + - #define **GAUSSIANBLUR_SHAREDMEMORY** : use shared memory in the Gaussian blur + - #define **SSAAx2** : SSAA; in our case, (width * 2) * (height * 2) + - #define **MSAAx2** : the "stupid" MSAA; it still has problems but works in its own way, see the related analysis below + - #define **BACKFACE_CULLING_IN_PIPELINE** : back-face culling as a pipeline stage (remove unwanted primitives using thrust::remove_if) + - #define **BACKFACE_CULLING_IN_RASTERIZER** : back-face culling the naive way (test directly in the rasterize kernel) + - #define **CORRECT_COLOR_LERP** : perspective-correct color interpolation between points on a primitive + - #define **BILINEAR_TEXTURE_FILTER** : bilinear texture filtering
+ +### Project Analysis (under x64 Release mode) +- #### Basic rasterizer pipeline analysis + ##### In this analysis no special effects are used: bloom post-processing, correct color lerp, back-face culling and AA are all turned off. Tests use the whole-triangle rasterization mode. Each test runs for 10 seconds. Data come from Nsight performance tests. The cow, flower and Cesium Milk Truck glTF models are used here. + + + ![](img/basicPipelineAnalysis.jpg) + + ##### Analysis: + First of all, broadly speaking, the rasterizer_fill kernel takes up most of the time. This is understandable, since rasterizer_fill rasterizes each primitive: it checks every pixel in the primitive's bounding box to see whether it lies inside the triangle, performs the atomic depth-buffer write and depth test, and finally writes the fragment buffer if the fragment passes. + Cow.gltf has the highest average FPS of the three, mainly because of its relatively simple primitive (triangle) layout; in most cases not many fragments overlap the same pixel (maybe 3 or 4 at most). As a result it is naturally more efficient to rasterize than the other two, and more frames are generated (so the render kernel is called more often and the FPS is higher). + Flower.gltf is known for its fragment overlap: its complex primitive (triangle) layout means that many fragments are generated for the same pixel. Most of them are redundant, since only one fragment per pixel is actually used in the fragment shading stage. Because of this, more time is wasted on atomic depth-buffer writes, which means more waiting. As a result, fewer frames are generated and the average FPS is lower. + CesiumMilkTruck.gltf actually has "only" 3624 primitives (triangles), which is not a large number compared to cow and flower. But it is composed of 5 mesh components (window, glass, truck, ...), which means the vertex shading and primitive assembly kernels are each called 5 times per frame, as is the rasterizer_fill kernel. Besides that, since it is a heavily textured mesh, a lot of extra time is spent fetching texture colors from global memory. So fewer frames are generated and the FPS is lower.
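To make the depth-arbitration step above concrete, here is a minimal, illustrative sketch of the fixed-point atomicMin pattern that rasterizer_fill relies on. The kernel and buffer names (depthTestDemo, pixelOfCandidate, winningCandidate) are hypothetical stand-ins, and the "winner" write stands in for filling the real Fragment (eye-space position, normal, texture color):

```cuda
#include <cuda_runtime.h>

// Sketch: candidate fragment depths are scaled into fixed-point ints so that
// atomicMin can pick the closest fragment per pixel; the buffer is re-read
// afterwards so only the current winner writes its output.
__global__ void depthTestDemo(const float* candidateDepth, const int* pixelOfCandidate,
                              int* depthBuffer, int* winningCandidate, int numCandidates)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= numCandidates) return;

    int pixel = pixelOfCandidate[i];
    int d = (int)(candidateDepth[i] * 100000.0f);   // same fixed-point scale used in rasterize.cu

    atomicMin(&depthBuffer[pixel], d);              // keep the smallest (closest) depth
    if (d <= depthBuffer[pixel]) {                  // re-read after the atomic
        winningCandidate[pixel] = i;                // stands in for writing the Fragment
    }
}
```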
+ + + +- #### Rasterizer pipeline with bloom post-processing stages analysis + ##### In this analysis, bloom post-processing is turned on on top of the basic pipeline, and Gaussian blur with and without shared memory is compared. Each test runs for 10 seconds. Cow.gltf is used. Data come from Nsight performance tests. The bloom post-processing pipeline is shown below : + + ![](img/bloomPipeLine.jpg) +
 + About the bloom post effect: the basic idea is actually simple. First, the brightness filter combines the color components (r, g, b) of each frame-buffer value into a brightness measure and keeps only the sufficiently bright contributions. Then we downscale the frame by 10. The reason for this is that we don't want to waste time sampling a large area of pixels (a large Gaussian "kernel size") during the blur; sampling a downscaled image has the same effect as Gaussian-blurring a large area of pixels. + Also, to make the Gaussian blur more efficient, we blur separably instead of sampling the whole 2D area: first horizontally, then vertically. In our case the blur covers an 11x11 pixel area; sampling horizontally and then vertically needs only 11 + 11 = 22 samples per pixel, whereas sampling the whole area would need 11x11 = 121. Finally, we combine the blurred result with the original frame buffer. + + Main reference : [OpenGL Bloom Tutorial by Philip Rideout](http://prideout.net/archive/bloom/index.php) +
 + Here are two pure Gaussian blur images : + +![](img/bloom_gaussianBlur.jpg) | ![](img/GaussianBlur+5TimeScaleDown.jpg) +-------------------------- | ---------------------------- +Gaussian blur, white cow | Gaussian blur, wireframe cow + downscale by 5 + + ![](img/bloomPipelineAnalysis.jpg) +
 + ##### Analysis: + The bloom post-processing pipeline adds 5 extra stages, but fortunately not much extra time is spent. The reason is pretty simple: except for the brightness filter and the frame-buffer combine, everything (the downscale sampling and the horizontal and vertical Gaussian blurs) happens on a (width/10) x (height/10) frame buffer (although I add 20 fringe pixels horizontally and vertically so that the shared-memory reads are easier). Besides that, the input is only a frame buffer, so all the complex fragment-buffer bookkeeping can be ignored here, and in our case the frame-buffer input is just a fixed color value per pixel; as a result little extra time is spent and the FPS is not affected much. + As for the shared memory used here: it is very straightforward, and far fewer global memory reads are needed (we first load a tile into shared memory, with the tile size determined by the block size). As a result we see roughly a 100% improvement for the horizontal and vertical Gaussian blur kernels.
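To make the separable-blur idea concrete, here is an illustrative single-pass kernel using the same 11 Gaussian weights that appear in horizontalGaussianBlur in rasterize.cu. It is a simplified sketch (no fringe-pixel padding and no shared-memory tiling); the vertical pass would be identical, stepping over rows instead of columns:

```cuda
#include <glm/glm.hpp>

// Sketch of one horizontal pass of the separable Gaussian blur. The real kernels
// pad the buffer with fringe pixels and optionally tile it in shared memory; here
// the unpadded border is simply skipped to keep the example short.
__global__ void horizontalBlurSketch(int w, int h, const glm::vec3* in, glm::vec3* out)
{
    const float weights[11] = { 0.0093f,   0.028002f, 0.065984f, 0.121703f, 0.175713f,
                                0.198596f,
                                0.175713f, 0.121703f, 0.065984f, 0.028002f, 0.0093f };

    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < 5 || x >= w - 5 || y >= h) return;   // skip pixels whose window leaves the image

    glm::vec3 sum(0.0f);
    for (int t = -5; t <= 5; ++t) {              // 11 taps along the row
        sum += in[(x + t) + y * w] * weights[t + 5];
    }
    out[x + y * w] = sum;                        // vertical pass: same loop over (x, y + t)
}
```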
+ + + +- #### Back-face culling analysis + ##### In this analysis I use the basic rasterizer pipeline. No special effects are turned on. Cow.gltf is used. Each test runs for 10 seconds. + For back-face culling, I tried two approaches : + 1. Add an if statement in the rasterization stage + 2. Before the rasterization stage, copy the device primitive array to a new array and use thrust::remove_if to remove back-facing primitives (a condensed sketch of this predicate follows this section) + + ![](img/backFaceCullingAnalysis.jpg) +
 + ##### Analysis: + The results show that, for cow.gltf, back-face culling has no apparent influence on overall performance. For the first approach, directly adding an if statement inside the rasterization kernel introduces a lot of divergence. For the second approach, extra time is needed for the device-to-device memory copy and for thrust::remove_if. Still, as the chart above shows, rasterizer_fill itself takes less time than in the other two cases.
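For reference, the stream-compaction path is essentially the following predicate plus a thrust::remove_if call, condensed from the BACKFACE_CULLING_IN_PIPELINE code further down in rasterize.cu (it relies on the project's Primitive struct, so it is not fully self-contained here):

```cuda
#include <thrust/device_ptr.h>
#include <thrust/remove.h>
#include <glm/glm.hpp>

// Condensed from rasterize.cu: a primitive is culled when all three of its
// eye-space vertex normals point away from the view direction.
struct isBackFacing
{
    glm::vec3 viewForwardVec;
    isBackFacing(glm::vec3 vec) : viewForwardVec(vec) {}

    __host__ __device__
    bool operator()(const Primitive& p) const
    {
        return glm::dot(p.v[0].eyeNor, viewForwardVec) < 0 &&
               glm::dot(p.v[1].eyeNor, viewForwardVec) < 0 &&
               glm::dot(p.v[2].eyeNor, viewForwardVec) < 0;
    }
};

// Usage on a device copy of the primitive array:
// thrust::device_ptr<Primitive> ptr(dev_primitives_after_backfaceCulling);
// int newSize = thrust::remove_if(ptr, ptr + numPrimitives,
//                                 isBackFacing(viewForwardVec)) - ptr;
```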
+ + + +- #### Correct color interpolation between vertices on a primitive + ##### I assign red, green and blue colors to the vertices of each triangle and interpolate a color for each fragment, which gives the colorful cow shown below. (Bloom is also turned on here, which I think looks cool.) + Actually, not only color: any vertex attribute should use perspective-correct interpolation, including normals and positions. This duck, for example, looks quite smooth under Lambert shading thanks to correct normal interpolation. + +![](img/color_correct_lerp.jpg) | ![](img/colorLerp_bloom.jpg) | ![](img/duck.jpg) | ![](img/milkTruck.jpg) +-------------------------- | -------------------------- | ---------------------------- | ---------------------------- +correct color lerp triangle | cow + color lerp + bloom | duck | cesium milk truck +
 + +- #### Wireframe and points + +![](img/3.gif) | ![](img/5.gif) +-------------------------- | ---------------------------- +wireframe cow | point cow +
 + +- #### AA analysis + ##### In this analysis I use the basic rasterizer pipeline. No special effects are turned on. Cow.gltf is used. + 1. SSAAx2 : this method is straightforward; every stage in the pipeline operates on a (2 \* width) \* (2 \* height) frame buffer. Finally, before the result is sent to the PBO for OpenGL to render, we sample 4 sub-pixels and average their colors to get each output pixel. + 2. \*MSAAx2 : the main reason I call my MSAA implementation "stupid" is that the fragment buffer size in this project is fixed; there is only a width \* height fragment buffer. Ideally, the fragment buffer size should be dynamic, so that we can keep more fragments along geometry edges, shade all of them in the fragment shading stage, and average the colors where several fragments land in one pixel. Since there was no time to rebuild everything around a dynamic-size fragment buffer, I had to choose between a width \* height and a (2 \* width) \* (2 \* height) buffer to realize MSAA. The basic algorithm I use is : + + ![](img/SSAA_illustartion.jpg) +
 + - Instead of sampling every red dot as in SSAA, we sample the blue dots. (Since we use a (2 \* width) \* (2 \* height) fragment buffer, it is actually laid out like the red dots.) As a result the stride is 2 (add 2 each iteration) and each iteration starts with an offset of 0.5 (the blue-dot position). + - When the blue dot is close to a geometry edge (like the pink line drawn in the picture), we sample all four red dots in that quad. When the blue dot is inside the geometry (triangle), we sample it once and use that fragment value to fill all four red sub-sample fragment buffers. + + As a result, a lot of time is spent detecting whether the sampled point is near an edge and filling all the sub-sample fragment buffers. These extra steps slow the MSAA path down and make it slower than SSAA, which is why I call it "stupid". Some noisy pixels appear with this algorithm, but it works anyway. :) + + Here are some detail images: + +![](img/no_AA_detail.jpg) | ![](img/SSAAx2_detail.jpg) | ![](img/MSAAx2_detail.jpg) +-------------------------- | ---------------------------- | ---------------------------- +no AA | SSAAx2 | MSAAx2 +
 + + +- #### Bilinear texture filtering + ##### When fetching from a texture, the texture is usually not displayed exactly as it is stored, so most pixels end up sampling a point "between" texels. Bilinear filtering performs bilinear interpolation among the four texels surrounding the point that the pixel represents (a short sketch follows this section). Reference: [Bilinear filtering](https://en.wikipedia.org/wiki/Bilinear_filtering) -### (TODO: Your README) +![](img/no_bilinear_filter.jpg) | ![](img/bilinear_filter.jpg) +-------------------------- | ---------------------------- +no bilinear filtering | bilinear filtering -*DO NOT* leave the README to the last minute! It is a crucial part of the -project, and we will not be able to grade you without a good README.
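As an illustration of the bilinear fetch described above, here is a condensed, sketch-level version of the lookup done under BILINEAR_TEXTURE_FILTER in rasterize.cu, assuming row-major texture data with 3 bytes per texel; the helper name sampleBilinear is illustrative. The non-filtered path simply truncates uv to the nearest texel index and reads the three channels directly.

```cuda
#include <glm/glm.hpp>

// Sketch of the bilinear texture fetch enabled by BILINEAR_TEXTURE_FILTER,
// assuming row-major texture data with 3 channels (as in rasterize.cu).
__device__ glm::vec3 sampleBilinear(const unsigned char* tex, int texW, int texH, glm::vec2 uv)
{
    // Map uv into texel space, centering samples on texel centers.
    float fx = glm::clamp(uv.x * texW - 0.5f, 0.0f, texW - 1.0f);
    float fy = glm::clamp(uv.y * texH - 0.5f, 0.0f, texH - 1.0f);
    int x0 = (int)fx,                     y0 = (int)fy;
    int x1 = glm::min(x0 + 1, texW - 1),  y1 = glm::min(y0 + 1, texH - 1);
    float u = fx - x0, v = fy - y0;       // fractional position inside the texel quad

    glm::vec3 c(0.0f);
    for (int ch = 0; ch < 3; ++ch) {
        float c00 = tex[(x0 + y0 * texW) * 3 + ch];
        float c10 = tex[(x1 + y0 * texW) * 3 + ch];
        float c01 = tex[(x0 + y1 * texW) * 3 + ch];
        float c11 = tex[(x1 + y1 * texW) * 3 + ch];
        // Interpolate horizontally along the two rows, then vertically between them.
        c[ch] = ((1.0f - u) * c00 + u * c10) * (1.0f - v)
              + ((1.0f - u) * c01 + u * c11) * v;
    }
    return c / 255.0f;   // normalize 8-bit texel values to [0, 1]
}
```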
### Credits diff --git a/img/0.gif b/img/0.gif new file mode 100644 index 0000000..c6e7213 Binary files /dev/null and b/img/0.gif differ diff --git a/img/1.gif b/img/1.gif new file mode 100644 index 0000000..4f8a6ee Binary files /dev/null and b/img/1.gif differ diff --git a/img/2.gif b/img/2.gif new file mode 100644 index 0000000..0d236e5 Binary files /dev/null and b/img/2.gif differ diff --git a/img/3.gif b/img/3.gif new file mode 100644 index 0000000..3b3d415 Binary files /dev/null and b/img/3.gif differ diff --git a/img/4.gif b/img/4.gif new file mode 100644 index 0000000..03b5a36 Binary files /dev/null and b/img/4.gif differ diff --git a/img/5.gif b/img/5.gif new file mode 100644 index 0000000..8e74fd6 Binary files /dev/null and b/img/5.gif differ diff --git a/img/GaussianBlur+5TimeScaleDown.jpg b/img/GaussianBlur+5TimeScaleDown.jpg new file mode 100644 index 0000000..8e826f7 Binary files /dev/null and b/img/GaussianBlur+5TimeScaleDown.jpg differ diff --git a/img/GaussianBlur.jpg b/img/GaussianBlur.jpg new file mode 100644 index 0000000..c7add38 Binary files /dev/null and b/img/GaussianBlur.jpg differ diff --git a/img/MSAA_error.jpg b/img/MSAA_error.jpg new file mode 100644 index 0000000..3e08a43 Binary files /dev/null and b/img/MSAA_error.jpg differ diff --git a/img/MSAAx2.jpg b/img/MSAAx2.jpg new file mode 100644 index 0000000..9d4489d Binary files /dev/null and b/img/MSAAx2.jpg differ diff --git a/img/MSAAx2_detail.jpg b/img/MSAAx2_detail.jpg new file mode 100644 index 0000000..a880fb8 Binary files /dev/null and b/img/MSAAx2_detail.jpg differ diff --git a/img/SSAA_illustartion.jpg b/img/SSAA_illustartion.jpg new file mode 100644 index 0000000..b4c7570 Binary files /dev/null and b/img/SSAA_illustartion.jpg differ diff --git a/img/SSAAx2.jpg b/img/SSAAx2.jpg new file mode 100644 index 0000000..76ddbfa Binary files /dev/null and b/img/SSAAx2.jpg differ diff --git a/img/SSAAx2_detail.jpg b/img/SSAAx2_detail.jpg new file mode 100644 index 0000000..ec37ac3 Binary files /dev/null and b/img/SSAAx2_detail.jpg differ diff --git a/img/backFaceCullingAnalysis.jpg b/img/backFaceCullingAnalysis.jpg new file mode 100644 index 0000000..4934f42 Binary files /dev/null and b/img/backFaceCullingAnalysis.jpg differ diff --git a/img/basicPipelineAnalysis.jpg b/img/basicPipelineAnalysis.jpg new file mode 100644 index 0000000..f5cb489 Binary files /dev/null and b/img/basicPipelineAnalysis.jpg differ diff --git a/img/bilinear_duck.jpg b/img/bilinear_duck.jpg new file mode 100644 index 0000000..4a0aefd Binary files /dev/null and b/img/bilinear_duck.jpg differ diff --git a/img/bilinear_filter.jpg b/img/bilinear_filter.jpg new file mode 100644 index 0000000..d2dd2e0 Binary files /dev/null and b/img/bilinear_filter.jpg differ diff --git a/img/bilinear_filter_2.jpg b/img/bilinear_filter_2.jpg new file mode 100644 index 0000000..e7f6633 Binary files /dev/null and b/img/bilinear_filter_2.jpg differ diff --git a/img/bilinear_filter_3.jpg b/img/bilinear_filter_3.jpg new file mode 100644 index 0000000..a5056f8 Binary files /dev/null and b/img/bilinear_filter_3.jpg differ diff --git a/img/bilinear_filter_4.jpg b/img/bilinear_filter_4.jpg new file mode 100644 index 0000000..dd570dd Binary files /dev/null and b/img/bilinear_filter_4.jpg differ diff --git a/img/bilinear_filter_5.jpg b/img/bilinear_filter_5.jpg new file mode 100644 index 0000000..be9100b Binary files /dev/null and b/img/bilinear_filter_5.jpg differ diff --git a/img/bloomPipeLine.jpg b/img/bloomPipeLine.jpg new file mode 100644 index 0000000..4be640a 
Binary files /dev/null and b/img/bloomPipeLine.jpg differ diff --git a/img/bloomPipelineAnalysis.jpg b/img/bloomPipelineAnalysis.jpg new file mode 100644 index 0000000..0e3402c Binary files /dev/null and b/img/bloomPipelineAnalysis.jpg differ diff --git a/img/bloom_gaussianBlur.jpg b/img/bloom_gaussianBlur.jpg new file mode 100644 index 0000000..07d5ac3 Binary files /dev/null and b/img/bloom_gaussianBlur.jpg differ diff --git a/img/bloom_position error.jpg b/img/bloom_position error.jpg new file mode 100644 index 0000000..a1b61b7 Binary files /dev/null and b/img/bloom_position error.jpg differ diff --git a/img/colorLerp.jpg b/img/colorLerp.jpg new file mode 100644 index 0000000..beb981e Binary files /dev/null and b/img/colorLerp.jpg differ diff --git a/img/colorLerp_bloom.jpg b/img/colorLerp_bloom.jpg new file mode 100644 index 0000000..c15ad19 Binary files /dev/null and b/img/colorLerp_bloom.jpg differ diff --git a/img/color_correct_lerp.jpg b/img/color_correct_lerp.jpg new file mode 100644 index 0000000..33c3970 Binary files /dev/null and b/img/color_correct_lerp.jpg differ diff --git a/img/duck.jpg b/img/duck.jpg new file mode 100644 index 0000000..75179f7 Binary files /dev/null and b/img/duck.jpg differ diff --git a/img/milkTruck.jpg b/img/milkTruck.jpg new file mode 100644 index 0000000..0d67a00 Binary files /dev/null and b/img/milkTruck.jpg differ diff --git a/img/no_AA.jpg b/img/no_AA.jpg new file mode 100644 index 0000000..4e4f1a4 Binary files /dev/null and b/img/no_AA.jpg differ diff --git a/img/no_AA_detail.jpg b/img/no_AA_detail.jpg new file mode 100644 index 0000000..619e9c3 Binary files /dev/null and b/img/no_AA_detail.jpg differ diff --git a/img/no_bilinear_filter.jpg b/img/no_bilinear_filter.jpg new file mode 100644 index 0000000..10eb6f8 Binary files /dev/null and b/img/no_bilinear_filter.jpg differ diff --git a/reference.txt b/reference.txt new file mode 100644 index 0000000..bb195d6 --- /dev/null +++ b/reference.txt @@ -0,0 +1,3 @@ + + +http://prideout.net/archive/bloom/index.php OpenGL Bloom Tutorial by Philip Rideout diff --git a/src/main.cpp b/src/main.cpp index 7986959..9a148ac 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -52,6 +52,9 @@ int main(int argc, char **argv) { frame = 0; seconds = time(NULL); + + myShaderTimerSeconds = time(NULL); + fpstracker = 0; // Launch CUDA/GL @@ -99,6 +102,14 @@ void mainLoop() { float scale = 1.0f; float x_trans = 0.0f, y_trans = 0.0f, z_trans = -10.0f; float x_angle = 0.0f, y_angle = 0.0f; + +int renderMode = 1; + +bool openPostProcess = false; + +float self_rotation_angle = 0.0f; +float self_rotation_speed = 0.5f; + void runCuda() { // Map OpenGL buffer object for writing from CUDA on a single GPU // No data is moved (Win & Linux). 
When mapped to CUDA, OpenGL should not use this buffer @@ -115,12 +126,28 @@ void runCuda() { * glm::rotate(x_angle, glm::vec3(1.0f, 0.0f, 0.0f)) * glm::rotate(y_angle, glm::vec3(0.0f, 1.0f, 0.0f)); + glm::mat3 MV_normal = glm::transpose(glm::inverse(glm::mat3(V) * glm::mat3(M))); glm::mat4 MV = V * M; glm::mat4 MVP = P * MV; + // ------------------ Shader Timer Part --------------------------- + //double myShaderTimerSeconds2 = glfwGetTime(); + //float deltaTime = myShaderTimerSeconds2 - myShaderTimerSeconds; + //myShaderTimerSeconds = myShaderTimerSeconds2; + + //self_rotation_angle += (self_rotation_speed * deltaTime); + + //if (self_rotation_angle >= 360.0f) { + // self_rotation_angle = 0.0f; + //} + // ----------------------------------------------------------------- + glm::vec3 viewForwardVec = glm::vec3(0.0f, 0.0f, 1.0f); + + glm::mat4 self_Rotate_M = glm::rotate(self_rotation_angle, glm::vec3(0.0f, 1.0f, 0.0f)); + cudaGLMapBufferObject((void **)&dptr, pbo); - rasterize(dptr, MVP, MV, MV_normal); + rasterize(dptr, MVP, MV, MV_normal, renderMode, self_Rotate_M, openPostProcess, viewForwardVec); cudaGLUnmapBufferObject(pbo); frame++; @@ -165,6 +192,7 @@ bool init(const tinygltf::Scene & scene) { glfwSetCursorPosCallback(window, mouseMotionCallback); glfwSetScrollCallback(window, mouseWheelCallback); + { std::map >::const_iterator it( scene.scenes.begin()); @@ -189,6 +217,8 @@ bool init(const tinygltf::Scene & scene) { glUseProgram(passthroughProgram); glActiveTexture(GL_TEXTURE0); + myShaderTimerSeconds = glfwGetTime(); + return true; } @@ -325,9 +355,26 @@ void errorCallback(int error, const char *description) { } void keyCallback(GLFWwindow *window, int key, int scancode, int action, int mods) { - if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS) { - glfwSetWindowShouldClose(window, GL_TRUE); - } + + if (action == GLFW_PRESS) { + switch (key) { + case GLFW_KEY_ESCAPE: + glfwSetWindowShouldClose(window, GL_TRUE); + break; + case GLFW_KEY_1: + renderMode = 1; + break; + case GLFW_KEY_2: + renderMode = 2; + break; + case GLFW_KEY_3: + renderMode = 3; + break; + case GLFW_KEY_P: + openPostProcess = !openPostProcess; + break; + } + } } //---------------------------- @@ -396,5 +443,8 @@ void mouseMotionCallback(GLFWwindow* window, double xpos, double ypos) void mouseWheelCallback(GLFWwindow* window, double xoffset, double yoffset) { const double s = 1.0; // sensitivity + + if (yoffset > 0 && z_trans > -1.5f) { return; } + z_trans += (float)(s * yoffset); } diff --git a/src/main.hpp b/src/main.hpp index 4816fa1..95b8ef1 100644 --- a/src/main.hpp +++ b/src/main.hpp @@ -33,6 +33,10 @@ using namespace std; int frame; int fpstracker; double seconds; + +double myShaderTimerSeconds; + + int fps = 0; GLuint positionLocation = 0; GLuint texcoordsLocation = 1; @@ -99,6 +103,7 @@ void keyCallback(GLFWwindow *window, int key, int scancode, int action, int mods //---------------------------- std::string getFilePathExtension(const std::string &FileName); + void mouseButtonCallback(GLFWwindow* window, int button, int action, int mods); void mouseMotionCallback(GLFWwindow* window, double xpos, double ypos); void mouseWheelCallback(GLFWwindow* window, double xoffset, double yoffset); \ No newline at end of file diff --git a/src/rasterize.cu b/src/rasterize.cu index 1262a09..0f317fc 100644 --- a/src/rasterize.cu +++ b/src/rasterize.cu @@ -18,6 +18,46 @@ #include #include +#include +#include +#include + +// TODO : do performance test here + +// -------------------------------------------- +// Shared 
memory use in Gaussian Blur +#define GAUSSIANBLUR_SHAREDMEMORY + +// -------------------------------------------- +// SSAA +//#define SSAAx2 + +// -------------------------------------------- +// MSAA remains some problems in our project... +// Fragment buffer size is fixed(width * height) +// Ideally, dynamic size fragment buffer is wanted + +// In my case, I directly use SSAA fragment size in MSAA (2 * width * 2 * height) +// and this stupid way cause its performance worse than SSAA.... +// Besides, some unwanted artifacts appear here + +//#define MSAAx2 + +// -------------------------------------------- +// Backface culling +// Pipeline way (remove unwanted primitive using thrust::remove_if) +//#define BACKFACE_CULLING_IN_PIPELINE +// naive way (directly do test in rasterizer) +//#define BACKFACE_CULLING_IN_RASTERIZER + +// -------------------------------------------- +// Correct color interpolation between points on a primitive +//#define CORRECT_COLOR_LERP + + +// -------------------------------------------- +// #define BILINEAR_TEXTURE_FILTER + namespace { typedef unsigned short VertexIndex; @@ -43,10 +83,14 @@ namespace { glm::vec3 eyePos; // eye space position used for shading glm::vec3 eyeNor; // eye space normal used for shading, cuz normal will go wrong after perspective transformation - // glm::vec3 col; + +#ifdef CORRECT_COLOR_LERP + glm::vec3 col; +#endif + glm::vec2 texcoord0; TextureData* dev_diffuseTex = NULL; - // int texWidth, texHeight; + int diffuseTexWidth, diffuseTexHeight; // ... }; @@ -56,16 +100,20 @@ namespace { }; struct Fragment { - glm::vec3 color; + glm::vec3 color; // color == (texcoord0 + dev_diffuseTex) // TODO: add new attributes to your Fragment // The attributes listed below might be useful, // but always feel free to modify on your own - // glm::vec3 eyePos; // eye space position used for shading - // glm::vec3 eyeNor; - // VertexAttributeTexcoord texcoord0; - // TextureData* dev_diffuseTex; + glm::vec3 eyePos; // eye space position used for shading + glm::vec3 eyeNor; + + // We have directily read color from texture and store color in glm::vec3 color + // so we don't want uv and texture anymore + + //glm::vec2 texcoord0; + //TextureData* dev_diffuseTex; // ... }; @@ -106,25 +154,53 @@ static int height = 0; static int totalNumPrimitives = 0; static Primitive *dev_primitives = NULL; + + +#ifdef BACKFACE_CULLING_IN_PIPELINE +static Primitive *dev_primitives_after_backfaceCulling = NULL; +#endif + static Fragment *dev_fragmentBuffer = NULL; + static glm::vec3 *dev_framebuffer = NULL; + +//Used in post-processing +static glm::vec3 *dev_framebuffer1 = NULL; +static glm::vec3 *dev_framebuffer_DownScaleBy10 = NULL; +static glm::vec3 *dev_framebuffer_DownScaleBy10_2 = NULL; + + static int * dev_depth = NULL; // you might need this buffer when doing depth test + + /** * Kernel that writes the image to the OpenGL PBO directly. 
*/ __global__ -void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) { +void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image, int framebufferEdgeOffset, int downScale_w, int downScaleRate) { int x = (blockIdx.x * blockDim.x) + threadIdx.x; int y = (blockIdx.y * blockDim.y) + threadIdx.y; int index = x + (y * w); if (x < w && y < h) { + + int framebufferIndex; + + if (downScaleRate == 1) + { + framebufferIndex = x + (y * w); + } + else { + // for downscale frame buffer debug + framebufferIndex = (x / downScaleRate) + framebufferEdgeOffset + (((y / downScaleRate) + framebufferEdgeOffset) * (downScale_w + 2 * framebufferEdgeOffset)); + } + glm::vec3 color; - color.x = glm::clamp(image[index].x, 0.0f, 1.0f) * 255.0; - color.y = glm::clamp(image[index].y, 0.0f, 1.0f) * 255.0; - color.z = glm::clamp(image[index].z, 0.0f, 1.0f) * 255.0; + color.x = glm::clamp(image[framebufferIndex].x, 0.0f, 1.0f) * 255.0; + color.y = glm::clamp(image[framebufferIndex].y, 0.0f, 1.0f) * 255.0; + color.z = glm::clamp(image[framebufferIndex].z, 0.0f, 1.0f) * 255.0; // Each thread writes one pixel location in the texture (textel) pbo[index].w = 0; pbo[index].x = color.x; @@ -133,36 +209,351 @@ void sendImageToPBO(uchar4 *pbo, int w, int h, glm::vec3 *image) { } } +#if defined(SSAAx2) || defined(MSAAx2) + +// w, h should be Nx downscale image size +// image should be supersample buffer data +__global__ +void sendImageToPBO_AAxN(uchar4 *pbo, int w, int h, glm::vec3 *image, int SSAA_Rate) { + int x = (blockIdx.x * blockDim.x) + threadIdx.x; + int y = (blockIdx.y * blockDim.y) + threadIdx.y; + + if (x < w && y < h) { + + int index = x + (y * w); + + float totalSubSampleNumber = (float)SSAA_Rate * (float)SSAA_Rate; + + glm::vec3 color = glm::vec3(0.0f); + for (int i = 0; i < SSAA_Rate; i++) { + for (int j = 0; j < SSAA_Rate; j++) { + int subSamplePixelX = SSAA_Rate * x + i; + int subSamplePixelY = SSAA_Rate * y + j; + + int subSamplePixelIndex = subSamplePixelX + (subSamplePixelY * w * SSAA_Rate); + color.x += glm::clamp(image[subSamplePixelIndex].x, 0.0f, 1.0f) * 255.0; + color.y += glm::clamp(image[subSamplePixelIndex].y, 0.0f, 1.0f) * 255.0; + color.z += glm::clamp(image[subSamplePixelIndex].z, 0.0f, 1.0f) * 255.0; + } + } + + // Each thread writes one pixel location in the texture (textel) + pbo[index].w = 0; + pbo[index].x = color.x / totalSubSampleNumber; + pbo[index].y = color.y / totalSubSampleNumber; + pbo[index].z = color.z / totalSubSampleNumber; + } +} +#endif + + /** * Writes fragment colors to the framebuffer */ __global__ -void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) { +void render(int w, int h, glm::vec3 lightPos, Fragment *fragmentBuffer, glm::vec3 *framebuffer, int renderMode, int framebufferEdgeOffset) { int x = (blockIdx.x * blockDim.x) + threadIdx.x; int y = (blockIdx.y * blockDim.y) + threadIdx.y; - int index = x + (y * w); if (x < w && y < h) { - framebuffer[index] = fragmentBuffer[index].color; + int index = x + (y * w); // TODO: add your fragment shader code here + Fragment thisFragment = fragmentBuffer[index]; + + // whole Triangle render mode + if (renderMode == 1) { + // Lambert shading + + glm::vec3 lightVec = lightPos - thisFragment.eyePos; + lightVec = glm::normalize(lightVec); + float light_cosTheta = glm::min(glm::max(glm::dot(thisFragment.eyeNor, lightVec), 0.0f), 1.0f); + + float ambientTerm = 0.6f; + + float light_power = 3.0f; + + float light_intensity = light_power * light_cosTheta + ambientTerm; // add ambient term so that we can 
still see points that are not lit by point light + + framebuffer[index] = light_intensity * thisFragment.color; + + } + + // wireframe or point mode + if (renderMode == 2 || renderMode == 3) { + framebuffer[index] = thisFragment.color; + } } } +// Post-processing stage +__global__ +void horizontalGaussianBlur(int w, int h, glm::vec3 *framebuffer_in, glm::vec3 *framebuffer_out, int framebufferEdgeOffset) { + +#ifdef GAUSSIANBLUR_SHAREDMEMORY + //array size should be blocksize.y * (framebufferEdgeOffset + blocksize.x + framebufferEdgeOffset) + //In our case -> 8 * (5 + 8 + 5) -> 144 + __shared__ glm::vec3 framebuffer_in_shared[144]; + +#endif + + int x = (blockIdx.x * blockDim.x) + threadIdx.x; + int y = (blockIdx.y * blockDim.y) + threadIdx.y; + + if (x < w && y < h) { + + int framebufferIndex = (x + framebufferEdgeOffset) + ((y + framebufferEdgeOffset) * (w + 2 * framebufferEdgeOffset)); + framebuffer_out[framebufferIndex] = glm::vec3(0.f); + +#ifdef GAUSSIANBLUR_SHAREDMEMORY + // framebufferEdgeOffset + blocksize.x + framebufferEdgeOffset + // 18 -> 5 + 8 + 5 + int index = threadIdx.y * 18 + threadIdx.x + 5; + + if (threadIdx.x == 0) { + framebuffer_in_shared[index - 5] = framebuffer_in[framebufferIndex - 5]; + framebuffer_in_shared[index - 4] = framebuffer_in[framebufferIndex - 4]; + framebuffer_in_shared[index - 3] = framebuffer_in[framebufferIndex - 3]; + framebuffer_in_shared[index - 2] = framebuffer_in[framebufferIndex - 2]; + framebuffer_in_shared[index - 1] = framebuffer_in[framebufferIndex - 1]; + } + + if (threadIdx.x == blockDim.x - 1) { + framebuffer_in_shared[index + 1] = framebuffer_in[framebufferIndex + 1]; + framebuffer_in_shared[index + 2] = framebuffer_in[framebufferIndex + 2]; + framebuffer_in_shared[index + 3] = framebuffer_in[framebufferIndex + 3]; + framebuffer_in_shared[index + 4] = framebuffer_in[framebufferIndex + 4]; + framebuffer_in_shared[index + 5] = framebuffer_in[framebufferIndex + 5]; + } + + framebuffer_in_shared[index] = framebuffer_in[framebufferIndex]; + + __syncthreads(); + + framebuffer_out[framebufferIndex] += framebuffer_in_shared[index - 5] * 0.0093f; + framebuffer_out[framebufferIndex] += framebuffer_in_shared[index - 4] * 0.028002f; + framebuffer_out[framebufferIndex] += framebuffer_in_shared[index - 3] * 0.065984f; + framebuffer_out[framebufferIndex] += framebuffer_in_shared[index - 2] * 0.121703f; + framebuffer_out[framebufferIndex] += framebuffer_in_shared[index - 1] * 0.175713f; + framebuffer_out[framebufferIndex] += framebuffer_in_shared[index] * 0.198596f; + framebuffer_out[framebufferIndex] += framebuffer_in_shared[index + 1] * 0.175713f; + framebuffer_out[framebufferIndex] += framebuffer_in_shared[index + 2] * 0.121703f; + framebuffer_out[framebufferIndex] += framebuffer_in_shared[index + 3] * 0.065984f; + framebuffer_out[framebufferIndex] += framebuffer_in_shared[index + 4] * 0.028002f; + framebuffer_out[framebufferIndex] += framebuffer_in_shared[index + 5] * 0.0093f; + + +#else + + framebuffer_out[framebufferIndex] += framebuffer_in[framebufferIndex - 5] * 0.0093f; + framebuffer_out[framebufferIndex] += framebuffer_in[framebufferIndex - 4] * 0.028002f; + framebuffer_out[framebufferIndex] += framebuffer_in[framebufferIndex - 3] * 0.065984f; + framebuffer_out[framebufferIndex] += framebuffer_in[framebufferIndex - 2] * 0.121703f; + framebuffer_out[framebufferIndex] += framebuffer_in[framebufferIndex - 1] * 0.175713f; + framebuffer_out[framebufferIndex] += framebuffer_in[framebufferIndex] * 0.198596f; + framebuffer_out[framebufferIndex] += 
framebuffer_in[framebufferIndex + 1] * 0.175713f; + framebuffer_out[framebufferIndex] += framebuffer_in[framebufferIndex + 2] * 0.121703f; + framebuffer_out[framebufferIndex] += framebuffer_in[framebufferIndex + 3] * 0.065984f; + framebuffer_out[framebufferIndex] += framebuffer_in[framebufferIndex + 4] * 0.028002f; + framebuffer_out[framebufferIndex] += framebuffer_in[framebufferIndex + 5] * 0.0093f; +#endif + } +} + +__global__ +void verticalGaussianBlur(int w, int h, glm::vec3 *framebuffer_in, glm::vec3 *framebuffer_out, int framebufferEdgeOffset) { + +#ifdef GAUSSIANBLUR_SHAREDMEMORY + //array size should be blocksize.x * (framebufferEdgeOffset + blocksize.y + framebufferEdgeOffset) + //In our case -> 8 * (5 + 8 + 5) -> 144 + __shared__ glm::vec3 framebuffer_in_shared[144]; + +#endif + + int x = (blockIdx.x * blockDim.x) + threadIdx.x; + int y = (blockIdx.y * blockDim.y) + threadIdx.y; + + if (x < w && y < h) { + + int framebufferIndex = (x + framebufferEdgeOffset) + ((y + framebufferEdgeOffset) * (w + 2 * framebufferEdgeOffset)); + framebuffer_out[framebufferIndex] = glm::vec3(0.f); + + int numOfelementsOneRow = w + 2 * framebufferEdgeOffset; + +#ifdef GAUSSIANBLUR_SHAREDMEMORY + // blocksize.x + // 8 + int index = (threadIdx.y + 5) * 8 + threadIdx.x; + + + if (threadIdx.y == 0) { + // 40, 32, 24... -> 5 * blocksize.x + framebuffer_in_shared[index - 40] = framebuffer_in[framebufferIndex - 5 * numOfelementsOneRow]; + framebuffer_in_shared[index - 32] = framebuffer_in[framebufferIndex - 4 * numOfelementsOneRow]; + framebuffer_in_shared[index - 24] = framebuffer_in[framebufferIndex - 3 * numOfelementsOneRow]; + framebuffer_in_shared[index - 16] = framebuffer_in[framebufferIndex - 2 * numOfelementsOneRow]; + framebuffer_in_shared[index - 8] = framebuffer_in[framebufferIndex - 1 * numOfelementsOneRow]; + } + + if (threadIdx.y == blockDim.y - 1) { + framebuffer_in_shared[index + 8] = framebuffer_in[framebufferIndex + 1 * numOfelementsOneRow]; + framebuffer_in_shared[index + 16] = framebuffer_in[framebufferIndex + 2 * numOfelementsOneRow]; + framebuffer_in_shared[index + 24] = framebuffer_in[framebufferIndex + 3 * numOfelementsOneRow]; + framebuffer_in_shared[index + 32] = framebuffer_in[framebufferIndex + 4 * numOfelementsOneRow]; + framebuffer_in_shared[index + 40] = framebuffer_in[framebufferIndex + 5 * numOfelementsOneRow]; + } + + framebuffer_in_shared[index] = framebuffer_in[framebufferIndex]; + + __syncthreads(); + + framebuffer_out[framebufferIndex] += framebuffer_in_shared[index - 40] * 0.0093f; + framebuffer_out[framebufferIndex] += framebuffer_in_shared[index - 32] * 0.028002f; + framebuffer_out[framebufferIndex] += framebuffer_in_shared[index - 24] * 0.065984f; + framebuffer_out[framebufferIndex] += framebuffer_in_shared[index - 16] * 0.121703f; + framebuffer_out[framebufferIndex] += framebuffer_in_shared[index - 8] * 0.175713f; + framebuffer_out[framebufferIndex] += framebuffer_in_shared[index] * 0.198596f; + framebuffer_out[framebufferIndex] += framebuffer_in_shared[index + 8] * 0.175713f; + framebuffer_out[framebufferIndex] += framebuffer_in_shared[index + 16] * 0.121703f; + framebuffer_out[framebufferIndex] += framebuffer_in_shared[index + 24] * 0.065984f; + framebuffer_out[framebufferIndex] += framebuffer_in_shared[index + 32] * 0.028002f; + framebuffer_out[framebufferIndex] += framebuffer_in_shared[index + 40] * 0.0093f; + + +#else + + framebuffer_out[framebufferIndex] += framebuffer_in[framebufferIndex - 5 * numOfelementsOneRow] * 0.0093f; + 
framebuffer_out[framebufferIndex] += framebuffer_in[framebufferIndex - 4 * numOfelementsOneRow] * 0.028002f; + framebuffer_out[framebufferIndex] += framebuffer_in[framebufferIndex - 3 * numOfelementsOneRow] * 0.065984f; + framebuffer_out[framebufferIndex] += framebuffer_in[framebufferIndex - 2 * numOfelementsOneRow] * 0.121703f; + framebuffer_out[framebufferIndex] += framebuffer_in[framebufferIndex - 1 * numOfelementsOneRow] * 0.175713f; + framebuffer_out[framebufferIndex] += framebuffer_in[framebufferIndex] * 0.198596f; + framebuffer_out[framebufferIndex] += framebuffer_in[framebufferIndex + 1 * numOfelementsOneRow] * 0.175713f; + framebuffer_out[framebufferIndex] += framebuffer_in[framebufferIndex + 2 * numOfelementsOneRow] * 0.121703f; + framebuffer_out[framebufferIndex] += framebuffer_in[framebufferIndex + 3 * numOfelementsOneRow] * 0.065984f; + framebuffer_out[framebufferIndex] += framebuffer_in[framebufferIndex + 4 * numOfelementsOneRow] * 0.028002f; + framebuffer_out[framebufferIndex] += framebuffer_in[framebufferIndex + 5 * numOfelementsOneRow] * 0.0093f; +#endif + } +} + +// downScaleRate should compatible with downScale_w & downScale_h +__global__ +void sampleDownScaleSample(int downScale_w, int downScale_h, int downScaleRate, + int w, int h, + glm::vec3 *downScale_framebuffer, glm::vec3 *framebuffer, int framebufferEdgeOffset) +{ + int x = (blockIdx.x * blockDim.x) + threadIdx.x; + int y = (blockIdx.y * blockDim.y) + threadIdx.y; + + if (x < downScale_w && y < downScale_h) { + int index = (x + framebufferEdgeOffset) + ((y + framebufferEdgeOffset) * (downScale_w + 2 * framebufferEdgeOffset)); + + glm::vec3& thisFrameBufferCol = downScale_framebuffer[index]; + thisFrameBufferCol = glm::vec3(0.0f); + + float totalSampleNumber = (float)downScaleRate * (float)downScaleRate; + + int ori_framebuffer_x, ori_framebuffer_y; + int ori_framebuffer_index; + + for (int i = 0; i < downScaleRate; i++) { + for (int j = 0; j < downScaleRate; j++) { + ori_framebuffer_x = x * downScaleRate + i; + ori_framebuffer_y = y * downScaleRate + j; + + ori_framebuffer_x = glm::clamp(ori_framebuffer_x, 0, w - 1); + ori_framebuffer_y = glm::clamp(ori_framebuffer_y, 0, h - 1); + + ori_framebuffer_index = (ori_framebuffer_x) + (ori_framebuffer_y * w); + + thisFrameBufferCol += framebuffer[ori_framebuffer_index]; + } + } + //take the average value of samples + thisFrameBufferCol *= (1.0f / totalSampleNumber); + } +} + + +__global__ +void brightFilter(int w, int h, glm::vec3 *framebuffer_in, glm::vec3 *framebuffer_out) { + + int x = (blockIdx.x * blockDim.x) + threadIdx.x; + int y = (blockIdx.y * blockDim.y) + threadIdx.y; + + if (x < w && y < h) { + int index = (x) + ((y) * (w)); + glm::vec3 thisFrameBuffer_in = framebuffer_in[index]; + + float brightness = thisFrameBuffer_in.r * 0.2126f + thisFrameBuffer_in.g * 0.7152f + thisFrameBuffer_in.b * 0.0722f; + framebuffer_out[index] = brightness * thisFrameBuffer_in; + } +} + +__global__ +void combineFrameBuffer(int w, int h, glm::vec3 *mainScene_framebuffer, glm::vec3 *other_framebuffer, glm::vec3 *framebuffer_out, + int other_framebuffer_downScale_w, int other_framebuffer_downScaleRate, + int framebufferEdgeOffset) { + + int x = (blockIdx.x * blockDim.x) + threadIdx.x; + int y = (blockIdx.y * blockDim.y) + threadIdx.y; + + if (x < w && y < h) { + int mainSceneIdx = x + (y * w); + glm::vec3 thisMainSceneFrameBufferCol = mainScene_framebuffer[mainSceneIdx]; + + int other_framebufferIdx = (x / other_framebuffer_downScaleRate) + framebufferEdgeOffset + + (((y / 
other_framebuffer_downScaleRate) + framebufferEdgeOffset) * + (other_framebuffer_downScale_w + 2 * framebufferEdgeOffset)); + + + glm::vec3 otherFrameBufferColor = other_framebuffer[other_framebufferIdx]; + + framebuffer_out[mainSceneIdx] = thisMainSceneFrameBufferCol + 1.0f * otherFrameBufferColor; + } +} + + /** * Called once at the beginning of the program to allocate memory. */ + +int GaussianBlurEdgeRoom = 5; + void rasterizeInit(int w, int h) { - width = w; - height = h; + +#if defined(SSAAx2) || defined(MSAAx2) + width = 2 * w; + height = 2 * h; +#else + width = w; + height = h; +#endif + + cudaFree(dev_fragmentBuffer); cudaMalloc(&dev_fragmentBuffer, width * height * sizeof(Fragment)); cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment)); + cudaFree(dev_framebuffer); - cudaMalloc(&dev_framebuffer, width * height * sizeof(glm::vec3)); - cudaMemset(dev_framebuffer, 0, width * height * sizeof(glm::vec3)); - + cudaMalloc(&dev_framebuffer, (width) * (height) * sizeof(glm::vec3)); + cudaMemset(dev_framebuffer, 0, (width) * (height) * sizeof(glm::vec3)); + + cudaFree(dev_framebuffer1); + cudaMalloc(&dev_framebuffer1, (width) * (height) * sizeof(glm::vec3)); + cudaMemset(dev_framebuffer1, 0, (width) * (height) * sizeof(glm::vec3)); + + int downScaleRate = 10; + cudaFree(dev_framebuffer_DownScaleBy10); + cudaMalloc(&dev_framebuffer_DownScaleBy10, ((width / downScaleRate) + 2 * GaussianBlurEdgeRoom) * ((height / downScaleRate) + 2 * GaussianBlurEdgeRoom) * sizeof(glm::vec3)); + cudaMemset(dev_framebuffer_DownScaleBy10, 0, ((width / downScaleRate) + 2 * GaussianBlurEdgeRoom) * ((height / downScaleRate) + 2 * GaussianBlurEdgeRoom) * sizeof(glm::vec3)); + + cudaFree(dev_framebuffer_DownScaleBy10_2); + cudaMalloc(&dev_framebuffer_DownScaleBy10_2, ((width / downScaleRate) + 2 * GaussianBlurEdgeRoom) * ((height / downScaleRate) + 2 * GaussianBlurEdgeRoom) * sizeof(glm::vec3)); + cudaMemset(dev_framebuffer_DownScaleBy10_2, 0, ((width / downScaleRate) + 2 * GaussianBlurEdgeRoom) * ((height / downScaleRate) + 2 * GaussianBlurEdgeRoom) * sizeof(glm::vec3)); + cudaFree(dev_depth); cudaMalloc(&dev_depth, width * height * sizeof(int)); @@ -600,6 +991,10 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) { // 3. 
Malloc for dev_primitives { cudaMalloc(&dev_primitives, totalNumPrimitives * sizeof(Primitive)); + +#ifdef BACKFACE_CULLING_IN_PIPELINE + cudaMalloc(&dev_primitives_after_backfaceCulling, totalNumPrimitives * sizeof(Primitive)); +#endif } @@ -628,12 +1023,22 @@ void _vertexTransformAndAssembly( int numVertices, PrimitiveDevBufPointers primitive, glm::mat4 MVP, glm::mat4 MV, glm::mat3 MV_normal, - int width, int height) { + int width, int height, + glm::mat4 selfRotateM) +{ // vertex id int vid = (blockIdx.x * blockDim.x) + threadIdx.x; if (vid < numVertices) { + __shared__ glm::mat4 _selfRotateM; + + if (threadIdx.x == 0) { + _selfRotateM = selfRotateM; + } + + __syncthreads(); + // TODO: Apply vertex transformation here // Multiply the MVP matrix for each vertex position, this will transform everything into clipping space // Then divide the pos by its w element to transform into NDC space @@ -641,11 +1046,56 @@ void _vertexTransformAndAssembly( // TODO: Apply vertex assembly here // Assemble all attribute arraies into the primitive array - - } -} + VertexOut& this_dev_verticesOut = primitive.dev_verticesOut[vid]; + + // Handle screen space postion + glm::vec4 ClippingSpace_Pos = MVP * _selfRotateM * glm::vec4(primitive.dev_position[vid], 1.0f); + glm::vec4 NDC_Pos = (1.0f / ClippingSpace_Pos.w) * ClippingSpace_Pos; + glm::vec4 ScreenSpace_Pos = glm::vec4((NDC_Pos.x + 1.0f) * (float)width / 2.0f, + (1.0f - NDC_Pos.y) * (float)height / 2.0f, + NDC_Pos.z, + NDC_Pos.w); + + + this_dev_verticesOut.pos = ScreenSpace_Pos; + + // Handle eye space postion + this_dev_verticesOut.eyePos = glm::vec3(MV * _selfRotateM * glm::vec4(primitive.dev_position[vid], 1.0f)); + // Handle eye space normal + this_dev_verticesOut.eyeNor = glm::normalize(MV_normal * glm::mat3(_selfRotateM) * primitive.dev_normal[vid]); // normalized + + // Handle uv + if (primitive.dev_texcoord0 != NULL) { + this_dev_verticesOut.texcoord0 = primitive.dev_texcoord0[vid]; + } + else { + this_dev_verticesOut.texcoord0 = glm::vec2(0.0f); //set a default value, in case of some uninitialized error + } + + // Handle diffuse texture + if (primitive.dev_diffuseTex != NULL) { + //Assume all vertices use just one diffuse texture + this_dev_verticesOut.dev_diffuseTex = primitive.dev_diffuseTex; + this_dev_verticesOut.diffuseTexWidth = primitive.diffuseTexWidth; + this_dev_verticesOut.diffuseTexHeight = primitive.diffuseTexHeight; + } + +#ifdef CORRECT_COLOR_LERP + if (vid % 3 == 0) { + this_dev_verticesOut.col = glm::vec3(0.95f, 0.25f, 0.25f); + } + else if (vid % 3 == 1) { + this_dev_verticesOut.col = glm::vec3(0.25f, 0.95f, 0.25f); + } + else if (vid % 3 == 2) { + this_dev_verticesOut.col = glm::vec3(0.25f, 0.25f, 0.95f); + } +#endif + + } +} static int curPrimitiveBeginId = 0; @@ -660,12 +1110,12 @@ void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_ // TODO: uncomment the following code for a start // This is primitive assembly for triangles - //int pid; // id for cur primitives vector - //if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) { - // pid = iid / (int)primitive.primitiveType; - // dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType] - // = primitive.dev_verticesOut[primitive.dev_indices[iid]]; - //} + int pid; // id for cur primitives vector + if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) { + pid = iid / (int)primitive.primitiveType; + dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType] + = 
primitive.dev_verticesOut[primitive.dev_indices[iid]]; + } // TODO: other primitive types (point, line) @@ -675,14 +1125,494 @@ void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_ + +__device__ +void fillThisFragmentBuffer(Fragment& thisFragment, + glm::vec3 p, + glm::vec3 t1, glm::vec3 t2, glm::vec3 t3, + VertexOut v1, VertexOut v2, VertexOut v3) +{ + //Assume all vertices in one primitive use one Diffuse texture + int diffuseTexWidth = v1.diffuseTexWidth; + int diffuseTexHeight = v1.diffuseTexHeight; + TextureData* textureData = v1.dev_diffuseTex; + + // Handle Positions (assume gltf always has this property) + glm::vec3 lerp_eyePos = vec3AttributePersCorrectionLerp( + p, + t1, t2, t3, + v1.eyePos, v2.eyePos, v3.eyePos); + + thisFragment.eyePos = lerp_eyePos; + + + // Handle Normals (assume gltf always has this property) + glm::vec3 lerp_eyeNor = vec3AttributePersCorrectionLerp( + p, + t1, t2, t3, + v1.eyeNor, v2.eyeNor, v3.eyeNor); + + lerp_eyeNor = glm::normalize(lerp_eyeNor); // normalized + + thisFragment.eyeNor = lerp_eyeNor; + + + // Handle UV (assume gltf always has this property) + glm::vec2 lerp_uv = vec2AttributePersCorrectionLerp( + p, + t1, t2, t3, + v1.texcoord0, v2.texcoord0, v3.texcoord0); + + // Fetch color from texture + if (textureData != NULL) { + TextureData r, g, b; + +#ifdef BILINEAR_TEXTURE_FILTER + lerp_uv.x = glm::clamp(lerp_uv.x * diffuseTexWidth - 0.5f, 0.f, diffuseTexWidth - 1.0f); + lerp_uv.y = glm::clamp(lerp_uv.y * diffuseTexHeight - 0.5f, 0.f, diffuseTexHeight - 1.0f); + + float x = glm::floor(lerp_uv.x); + float y = glm::floor(lerp_uv.y); + + float u_ratio = lerp_uv.x - x; + float v_ratio = lerp_uv.y - y; + float u_opposite = 1.0f - u_ratio; + float v_opposite = 1.0f - v_ratio; + + int textIdx1 = x + diffuseTexWidth * y; + int textIdx2 = glm::clamp(x + 1.0f, 0.f, diffuseTexWidth - 1.0f) + diffuseTexWidth * y; + int textIdx3 = x + diffuseTexWidth * glm::clamp(y + 1.0f, 0.f, diffuseTexHeight - 1.0f); + int textIdx4 = glm::clamp(x + 1.0f, 0.f, diffuseTexWidth - 1.0f) + diffuseTexWidth * glm::clamp(y + 1.0f, 0.f, diffuseTexHeight - 1.0f); + + int numOfTextureChannels = 3; + + r = (u_opposite * textureData[textIdx1 * numOfTextureChannels] + u_ratio * textureData[textIdx2 * numOfTextureChannels]) * v_opposite + + (u_opposite * textureData[textIdx3 * numOfTextureChannels] + u_ratio * textureData[textIdx4 * numOfTextureChannels]) * v_ratio; + + g = (u_opposite * textureData[textIdx1 * numOfTextureChannels + 1] + u_ratio * textureData[textIdx2 * numOfTextureChannels + 1]) * v_opposite + + (u_opposite * textureData[textIdx3 * numOfTextureChannels + 1] + u_ratio * textureData[textIdx4 * numOfTextureChannels + 1]) * v_ratio; + + b = (u_opposite * textureData[textIdx1 * numOfTextureChannels + 2] + u_ratio * textureData[textIdx2 * numOfTextureChannels + 2]) * v_opposite + + (u_opposite * textureData[textIdx3 * numOfTextureChannels + 2] + u_ratio * textureData[textIdx4 * numOfTextureChannels + 2]) * v_ratio; + + +#else + glm::ivec2 textSpaceCoord = glm::ivec2(diffuseTexWidth * lerp_uv.x, diffuseTexHeight * (lerp_uv.y)); + + int textIdx = textSpaceCoord.x + diffuseTexWidth * textSpaceCoord.y; + + // Assume texture data are row major + // and there are 3 channels + int numOfTextureChannels = 3; + r = textureData[textIdx * numOfTextureChannels]; + g = textureData[textIdx * numOfTextureChannels + 1]; + b = textureData[textIdx * numOfTextureChannels + 2]; + +#endif // BILINEAR_TEXTURE_FILTER + + thisFragment.color = glm::vec3((float)r / 255.0f, 
(float)g / 255.0f, (float)b / 255.0f); + } + + else { + // Debug normal + //thisFragment.color = glm::vec3(0.5f * (lerp_eyeNor.x + 1.0f), + // 0.5f * (lerp_eyeNor.y + 1.0f), + // 0.5f * (lerp_eyeNor.z + 1.0f)); +#ifdef CORRECT_COLOR_LERP + // Handle Colors + glm::vec3 lerp_col = vec3AttributePersCorrectionLerp( + p, + t1, t2, t3, + v1.col, v2.col, v3.col); + + thisFragment.color = lerp_col; +#else + thisFragment.color = glm::vec3(0.95f, 0.95f, 0.95f); +#endif + } +} + + +// Rasterizer - Fill method +// whole triangle mode +__device__ +void rasterizer_fill_wholeTriangleMode(Fragment* fragmentBuffer, Primitive& thisPrimitive, int* depth, + glm::vec3 t1, glm::vec3 t2, glm::vec3 t3, + int w, int h) +{ + //Use AABB + float minX = fminf(t1.x, fminf(t2.x, t3.x)); + float maxX = fmaxf(t1.x, fmaxf(t2.x, t3.x)); + float minY = fminf(t1.y, fminf(t2.y, t3.y)); + float maxY = fmaxf(t1.y, fmaxf(t2.y, t3.y)); + + // make sure AABB is inside screen + int startX = minX < 0 ? 0 : (int)glm::floor(minX); + int endX = maxX > w ? w : (int)glm::ceil(maxX); + + int startY = minY < 0 ? 0 : (int)glm::floor(minY); + int endY = maxY > h ? h : (int)glm::ceil(maxY); + +#ifdef MSAAx2 + for (int i = startY; i <= endY; i += 2) { + for (int j = startX; j <= endX; j += 2) { + + // if point is on(very close, depends on epsilon) the edge of triangle + if (isPointOnTriangleEdge(glm::vec2(j + 0.5f, i + 0.5f), t1, t2, t3)) { + // do Multi sample + for (int p = 0; p < 2; p++) { + for (int q = 0; q < 2; q++) { + float lerp_depth = depthValuePersCorrectionLerp(glm::vec3(j + q, i + p, 0.f), t1, t2, t3); + int lerp_depth_int = (int)(lerp_depth * 100000.0f); + // Atomic depth buffer writing + int fragmentIdx = (j + q) + ((i + p) * w); + int old = depth[fragmentIdx]; + int assumed; + + do { + assumed = old; + old = atomicMin(&depth[fragmentIdx], lerp_depth_int); + } while (assumed != old); + + //must use depth[index] to read again! 
+ if (lerp_depth_int <= depth[fragmentIdx]) { + // pass depth test, this fragment is good, we will use it + glm::vec3 p(j + q, i + p, lerp_depth); + + // fill this fragment Buffer + fillThisFragmentBuffer(fragmentBuffer[fragmentIdx], + p, + t1, t2, t3, + thisPrimitive.v[0], thisPrimitive.v[1], thisPrimitive.v[2]); + } + } + } + } + + // if point is not on the edge of triangle + // but if it's inside the tirangle + else if (isPosInTriange(glm::vec3(j + 0.5f, i + 0.5f, 0.f), t1, t2, t3)) { + float lerp_depth = depthValuePersCorrectionLerp(glm::vec3(j + 0.5f, i + 0.5f, 0.f), t1, t2, t3); + int lerp_depth_int = (int)(lerp_depth * 100000.0f); + + glm::vec3 p(j + 0.5f, i + 0.5f, lerp_depth); + + // ----------- fill sub-Sample 1 ----------------- + // Atomic depth buffer writing + int fragmentIdx = j + (i * w); + int old = depth[fragmentIdx]; + int assumed; + do { + assumed = old; + old = atomicMin(&depth[fragmentIdx], lerp_depth_int); + } while (assumed != old); + + if (lerp_depth_int <= depth[fragmentIdx]) { + // fill this fragment Buffer + fillThisFragmentBuffer(fragmentBuffer[fragmentIdx], + p, + t1, t2, t3, + thisPrimitive.v[0], thisPrimitive.v[1], thisPrimitive.v[2]); + } + + // ----------- fill sub-Sample 2 ----------------- + old = depth[fragmentIdx + 1]; + do { + assumed = old; + old = atomicMin(&depth[fragmentIdx + 1], lerp_depth_int); + } while (assumed != old); + if (lerp_depth_int <= depth[fragmentIdx + 1]) { + // fill this fragment Buffer + fragmentBuffer[fragmentIdx + 1] = fragmentBuffer[fragmentIdx]; + } + + // ----------- fill sub-Sample 3 ----------------- + old = depth[fragmentIdx + w]; + do { + assumed = old; + old = atomicMin(&depth[fragmentIdx + w], lerp_depth_int); + } while (assumed != old); + if (lerp_depth_int <= depth[fragmentIdx + w]) { + // fill this fragment Buffer + fragmentBuffer[fragmentIdx + w] = fragmentBuffer[fragmentIdx]; + } + + // ----------- fill sub-Sample 4 ----------------- + old = depth[fragmentIdx + w + 1]; + do { + assumed = old; + old = atomicMin(&depth[fragmentIdx + w + 1], lerp_depth_int); + } while (assumed != old); + if (lerp_depth_int <= depth[fragmentIdx + w + 1]) { + // fill this fragment Buffer + fragmentBuffer[fragmentIdx + w + 1] = fragmentBuffer[fragmentIdx]; + + } + + + } + + } + } + +#else + + for (int i = startY; i <= endY; i++) { + for (int j = startX; j <= endX; j++) { + // Test if this pos is in the triangle + if (isPosInTriange(glm::vec3(j, i, 0.f), t1, t2, t3)) { + + //int fragmentIdx = j + (i * w); + //int lerp_depth_int = conductDepthTest(j, i, t1, t2, t3, fragmentIdx, depth); + + //IMPORTANT! + //Should interpolate Z (depth) value first + //z (depth) value is in camera(eye) space + float lerp_depth = depthValuePersCorrectionLerp(glm::vec3(j, i, 0.f), t1, t2, t3); + + // OK... atomicMin only works for Int + // but we want more accuracy for our depth value + // so TRICK here! + // multiply a really large number to get accuracy + // just pay attention integar is between -2147483648 - 2147483647 + // 10000 may be a acceptable number + int lerp_depth_int = (int)(lerp_depth * 100000.0f); + + // Atomic depth buffer writing + int fragmentIdx = j + (i * w); + int old = depth[fragmentIdx]; + int assumed; + + do { + assumed = old; + old = atomicMin(&depth[fragmentIdx], lerp_depth_int); + } while (assumed != old); + + + //must use depth[index] to read again! 
+ if (lerp_depth_int <= depth[fragmentIdx]) { + // pass depth test, this fragment is good, we will use it + glm::vec3 p((float)j, (float)i, lerp_depth); + + // fill this fragment Buffer + fillThisFragmentBuffer(fragmentBuffer[fragmentIdx], + p, + t1, t2, t3, + thisPrimitive.v[0], thisPrimitive.v[1], thisPrimitive.v[2]); + + } + } + } + } + +#endif // MSAAx2 + +} + +// Rasterizer - Fill method +// wireFrame mode +__device__ +void rasterizer_fill_wireFrameMode(Fragment* fragmentBuffer, int* depth, + glm::vec3 t1, glm::vec3 t2, glm::vec3 t3, + int w, int h) +{ + //Use AABB + float minX = fminf(t1.x, fminf(t2.x, t3.x)); + float maxX = fmaxf(t1.x, fmaxf(t2.x, t3.x)); + float minY = fminf(t1.y, fminf(t2.y, t3.y)); + float maxY = fmaxf(t1.y, fmaxf(t2.y, t3.y)); + + // make sure AABB is inside screen + int startX = minX < 0 ? 0 : (int)glm::floor(minX); + int endX = maxX > w ? w : (int)glm::ceil(maxX); + + int startY = minY < 0 ? 0 : (int)glm::floor(minY); + int endY = maxY > h ? h : (int)glm::ceil(maxY); + + glm::vec3 tris[3]; + tris[0] = t1; + tris[1] = t2; + tris[2] = t3; + int fragmentIdx; + + float Epsilon = 0.08f; // this controls the accuracy(thickness) of each line segment + glm::vec3 wireFrameCol = glm::vec3(0.35f, 0.85f, 0.35f); + + + for (int i = startY; i <= endY; i++) { + for (int j = startX; j <= endX; j++) { + + glm::vec3 barycentricCoord = calculateBarycentricCoordinate(tris, glm::vec2(j, i)); + + if (glm::abs(barycentricCoord.x) < Epsilon) { + if (barycentricCoord.y >= 0.0f && barycentricCoord.y <= 1.0f && + barycentricCoord.z >= 0.0f && barycentricCoord.z <= 1.0f) { + fragmentIdx = j + (i * w); + fragmentBuffer[fragmentIdx].color = wireFrameCol; + } + } + else if (glm::abs(barycentricCoord.y) < Epsilon) { + if (barycentricCoord.x >= 0.0f && barycentricCoord.x <= 1.0f && + barycentricCoord.z >= 0.0f && barycentricCoord.z <= 1.0f) { + fragmentIdx = j + (i * w); + fragmentBuffer[fragmentIdx].color = wireFrameCol; + } + } + else if (glm::abs(barycentricCoord.z) < Epsilon) { + if (barycentricCoord.y >= 0.0f && barycentricCoord.y <= 1.0f && + barycentricCoord.x >= 0.0f && barycentricCoord.x <= 1.0f) { + fragmentIdx = j + (i * w); + fragmentBuffer[fragmentIdx].color = wireFrameCol; + } + } + + } + } +} + +// Rasterizer - Fill method +// point mode +__device__ +void rasterizer_fill_pointMode(Fragment* fragmentBuffer, int* depth, + glm::vec3 t1, glm::vec3 t2, glm::vec3 t3, + int w, int h) +{ + //Use AABB + float minX = fminf(t1.x, fminf(t2.x, t3.x)); + float maxX = fmaxf(t1.x, fmaxf(t2.x, t3.x)); + float minY = fminf(t1.y, fminf(t2.y, t3.y)); + float maxY = fmaxf(t1.y, fmaxf(t2.y, t3.y)); + + // make sure AABB is inside screen + int startX = minX < 0 ? 0 : (int)glm::floor(minX); + int endX = maxX > w ? w : (int)glm::ceil(maxX); + + int startY = minY < 0 ? 0 : (int)glm::floor(minY); + int endY = maxY > h ? 
h : (int)glm::ceil(maxY); + + glm::vec3 tris[3]; + tris[0] = t1; + tris[1] = t2; + tris[2] = t3; + int fragmentIdx; + + float Epsilon = 0.08f; // this controls the accuracy(thickness) of each line segment + glm::vec3 pointCol = glm::vec3(0.85f, 0.85f, 0.85f); + + + for (int i = startY; i <= endY; i++) { + for (int j = startX; j <= endX; j++) { + + glm::vec3 barycentricCoord = calculateBarycentricCoordinate(tris, glm::vec2(j, i)); + + if (glm::abs(barycentricCoord.x - 1.0f) < Epsilon) { + if (glm::abs(barycentricCoord.y) < Epsilon && + glm::abs(barycentricCoord.z) < Epsilon) { + fragmentIdx = j + (i * w); + fragmentBuffer[fragmentIdx].color = pointCol; + } + } + else if (glm::abs(barycentricCoord.y - 1.0f) < Epsilon) { + if (glm::abs(barycentricCoord.x) < Epsilon && + glm::abs(barycentricCoord.z) < Epsilon) { + fragmentIdx = j + (i * w); + fragmentBuffer[fragmentIdx].color = pointCol; + } + } + else if (glm::abs(barycentricCoord.z - 1.0f) < Epsilon) { + if (glm::abs(barycentricCoord.x) < Epsilon && + glm::abs(barycentricCoord.y) < Epsilon) { + fragmentIdx = j + (i * w); + fragmentBuffer[fragmentIdx].color = pointCol; + } + } + + } + } +} + +// Rasterizer - Fill method +// Goal is to fill Fragment buffer +__global__ +void rasterizer_fill(int numPrimitives, int curPrimitiveBeginId, Primitive* primitives, Fragment* fragmentBuffer, int* depth, int w, int h, int renderMode, glm::vec3 viewForwardVec) +{ + int primitiveIdx = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (primitiveIdx < numPrimitives) { + +#ifdef BACKFACE_CULLING_IN_PIPELINE + Primitive& thisPrimitive = primitives[primitiveIdx]; +#else + Primitive& thisPrimitive = primitives[primitiveIdx + curPrimitiveBeginId]; +#endif + +#ifdef BACKFACE_CULLING_IN_RASTERIZER + // Naive Back-face culling + if (glm::dot(thisPrimitive.v[0].eyeNor, viewForwardVec) < 0 && + glm::dot(thisPrimitive.v[1].eyeNor, viewForwardVec) < 0 && + glm::dot(thisPrimitive.v[2].eyeNor, viewForwardVec) < 0) { + return; + } +#endif + // need to use NDC depth value, so that all depth are realtive to near clip + glm::vec3 t1(thisPrimitive.v[0].pos[0], thisPrimitive.v[0].pos[1], thisPrimitive.v[0].pos[2]); + glm::vec3 t2(thisPrimitive.v[1].pos[0], thisPrimitive.v[1].pos[1], thisPrimitive.v[1].pos[2]); + glm::vec3 t3(thisPrimitive.v[2].pos[0], thisPrimitive.v[2].pos[1], thisPrimitive.v[2].pos[2]); + + // Rasterize whole triangle + if (renderMode == 1) { + rasterizer_fill_wholeTriangleMode(fragmentBuffer, thisPrimitive, depth, + t1, t2, t3, + w, h + ); + } + + // Rasterize wireframe + if (renderMode == 2) { + rasterizer_fill_wireFrameMode(fragmentBuffer, depth, + t1, t2, t3, + w, h); + } + + // Rasterize point + if (renderMode == 3) { + rasterizer_fill_pointMode(fragmentBuffer, depth, + t1, t2, t3, + w, h); + } + + } +} + +#ifdef BACKFACE_CULLING_IN_PIPELINE + +struct isBackFacing +{ + glm::vec3 viewForwardVec; + isBackFacing(glm::vec3 vec) : viewForwardVec(vec) {}; + + __host__ __device__ + bool operator()(const Primitive x) + { + return (glm::dot(x.v[0].eyeNor, viewForwardVec) < 0 && + glm::dot(x.v[1].eyeNor, viewForwardVec) < 0 && + glm::dot(x.v[2].eyeNor, viewForwardVec) < 0) ; + } +}; +#endif + + /** * Perform rasterization. 
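+ * Stages: vertex transform & assembly, primitive assembly, rasterization
+ * (whole triangle / wireframe / points), Lambert fragment shading, optional
+ * bloom post-processing, then copy to the PBO.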
 */
-void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal) {
+void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal,
+    int renderMode, glm::mat4 selfRotateM,
+    bool openPostProcess,
+    glm::vec3 viewForwardVec) {
     int sideLength2d = 8;
     dim3 blockSize2d(sideLength2d, sideLength2d);
     dim3 blockCount2d((width - 1) / blockSize2d.x + 1,
-        (height - 1) / blockSize2d.y + 1);
+        (height - 1) / blockSize2d.y + 1);
 
     // Execute your rasterization pipeline here
     // (See README for rasterization pipeline outline.)
 
@@ -702,7 +1632,7 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
             dim3 numBlocksForVertices((p->numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
             dim3 numBlocksForIndices((p->numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
 
-            _vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> >(p->numVertices, *p, MVP, MV, MV_normal, width, height);
+            _vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> >(p->numVertices, *p, MVP, MV, MV_normal, width, height, selfRotateM);
             checkCUDAError("Vertex Processing");
             cudaDeviceSynchronize();
             _primitiveAssembly << < numBlocksForIndices, numThreadsPerBlock >> >
@@ -720,18 +1650,122 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
     }
 
     cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
+    initDepth << <blockCount2d, blockSize2d >> >(width, height, dev_depth);
-
-    // TODO: rasterize
+    // rasterize
+    {
+        curPrimitiveBeginId = 0;
+        dim3 numThreadsPerBlock(128);
+
+        auto it = mesh2PrimitivesMap.begin();
+        auto itEnd = mesh2PrimitivesMap.end();
+
+        for (; it != itEnd; ++it) {
+            auto p = (it->second).begin(); // each primitive
+            auto pEnd = (it->second).end();
+            for (; p != pEnd; ++p) {
+                dim3 numBlocksForPrimitives((p->numPrimitives + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
+
+#ifdef BACKFACE_CULLING_IN_PIPELINE
+                // First copy Primitives to a new array
+                cudaMemcpy(dev_primitives_after_backfaceCulling, dev_primitives + curPrimitiveBeginId, p->numPrimitives * sizeof(Primitive), cudaMemcpyDeviceToDevice);
+
+                // Remove primitives facing backward
+                thrust::device_ptr<Primitive> dev_thrust_primitves(dev_primitives_after_backfaceCulling);
+                int newPrimitiveSize = thrust::remove_if(dev_thrust_primitves, dev_thrust_primitves + p->numPrimitives, isBackFacing(viewForwardVec)) - dev_thrust_primitves;
+
+                // Calculate new block size
+                numBlocksForPrimitives = dim3((newPrimitiveSize + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
+
+                // rasterize based on the newly compacted primitives array
+                rasterizer_fill << <numBlocksForPrimitives, numThreadsPerBlock >> >
+                    (newPrimitiveSize, curPrimitiveBeginId,
+                    dev_primitives_after_backfaceCulling, dev_fragmentBuffer, dev_depth,
+                    width, height, renderMode,
+                    viewForwardVec);
+                checkCUDAError("rasterizer_fill");
+                cudaDeviceSynchronize();
+                curPrimitiveBeginId += p->numPrimitives;
+
+#else
+                rasterizer_fill << <numBlocksForPrimitives, numThreadsPerBlock >> >
+                    (p->numPrimitives, curPrimitiveBeginId,
+                    dev_primitives, dev_fragmentBuffer, dev_depth,
+                    width, height, renderMode,
+                    viewForwardVec);
+                checkCUDAError("rasterizer_fill");
+                cudaDeviceSynchronize();
+                curPrimitiveBeginId += p->numPrimitives;
+#endif
+            }
+        }
+    }
+
+    // point light position for Lambert shading
+    glm::vec3 lightPos(3.0f, 6.0f, -5.0f);
 
     // Copy depthbuffer colors into framebuffer
-    render << <blockCount2d, blockSize2d >> >(width, height, dev_fragmentBuffer, dev_framebuffer);
+    render << <blockCount2d, blockSize2d >> >(width, height, lightPos, dev_fragmentBuffer, dev_framebuffer, renderMode,
+        GaussianBlurEdgeRoom);
     checkCUDAError("fragment shader");
-    // Copy framebuffer into OpenGL buffer for OpenGL previewing
-    sendImageToPBO<<<blockCount2d, blockSize2d>>>(pbo, width, height, dev_framebuffer);
-    checkCUDAError("copy render result to pbo");
+
+    if (openPostProcess) {
+        // Post-processing Stage
+
+        //---------------------- Bloom Effect Starts --------------------------------
+        // Bright Filter
+        brightFilter << <blockCount2d, blockSize2d >> > (width, height, dev_framebuffer, dev_framebuffer1);
+
+
+        // Down Scale
+        int downScaleRate = 10;
+        dim3 blockCount2d_DownScaleBy10((width / downScaleRate - 1) / blockSize2d.x + 1,
+            (height / downScaleRate - 1) / blockSize2d.y + 1);
+        sampleDownScaleSample << <blockCount2d_DownScaleBy10, blockSize2d >> > (width / downScaleRate, height / downScaleRate, downScaleRate,
+            width, height,
+            dev_framebuffer_DownScaleBy10, dev_framebuffer1, GaussianBlurEdgeRoom);
+
+
+        // GaussianBlur: 11 samples horizontally and then vertically in our case
+        // Make sure blockSize2d does not change; the shared memory size is decided based on it
+        horizontalGaussianBlur << <blockCount2d_DownScaleBy10, blockSize2d >> > (width / downScaleRate, height / downScaleRate, dev_framebuffer_DownScaleBy10, dev_framebuffer_DownScaleBy10_2, GaussianBlurEdgeRoom);
+        verticalGaussianBlur << <blockCount2d_DownScaleBy10, blockSize2d >> > (width / downScaleRate, height / downScaleRate, dev_framebuffer_DownScaleBy10_2, dev_framebuffer_DownScaleBy10, GaussianBlurEdgeRoom);
+
+
+        // Combine
+        combineFrameBuffer << <blockCount2d, blockSize2d >> > (width, height,
+            dev_framebuffer, dev_framebuffer_DownScaleBy10, dev_framebuffer1,
+            width / downScaleRate, downScaleRate, GaussianBlurEdgeRoom);
+        checkCUDAError("post processing");
+        //---------------------- Bloom Effect Ends --------------------------------
+
+
+#if defined(SSAAx2) || defined(MSAAx2)
+        dim3 blockCount2d_AAx2_DownScaleBy2(((width / 2) - 1) / blockSize2d.x + 1,
+            ((height / 2) - 1) / blockSize2d.y + 1);
+        sendImageToPBO_AAxN << <blockCount2d_AAx2_DownScaleBy2, blockSize2d >> > (pbo, width / 2, height / 2, dev_framebuffer1, 2);
+
+#else
+        // Copy framebuffer into OpenGL buffer for OpenGL previewing
+        sendImageToPBO << <blockCount2d, blockSize2d >> > (pbo, width, height, dev_framebuffer1, GaussianBlurEdgeRoom, width / 1, 1);
+
+        // Downscale Debug
+        //sendImageToPBO << <blockCount2d, blockSize2d >> >(pbo, width, height, dev_framebuffer_DownScaleBy10, GaussianBlurEdgeRoom, width / downScaleRate, downScaleRate);
+#endif
+        checkCUDAError("copy render result to pbo");
+    }
+
+    // skip the post-processing stage
+    else {
+#if defined(SSAAx2) || defined(MSAAx2)
+        dim3 blockCount2d_AAx2_DownScaleBy2(((width / 2) - 1) / blockSize2d.x + 1,
+            ((height / 2) - 1) / blockSize2d.y + 1);
+        sendImageToPBO_AAxN << <blockCount2d_AAx2_DownScaleBy2, blockSize2d >> > (pbo, width / 2, height / 2, dev_framebuffer, 2);
+
+#else
+        sendImageToPBO << <blockCount2d, blockSize2d >> > (pbo, width, height, dev_framebuffer, GaussianBlurEdgeRoom, width / 1, 1);
+#endif
+    }
 }
 
 /**
@@ -763,12 +1797,26 @@ void rasterizeFree() {
     cudaFree(dev_primitives);
     dev_primitives = NULL;
 
+#ifdef BACKFACE_CULLING_IN_PIPELINE
+    cudaFree(dev_primitives_after_backfaceCulling);
+    dev_primitives_after_backfaceCulling = NULL;
+#endif
+
     cudaFree(dev_fragmentBuffer);
     dev_fragmentBuffer = NULL;
 
     cudaFree(dev_framebuffer);
     dev_framebuffer = NULL;
 
+    cudaFree(dev_framebuffer1);
+    dev_framebuffer1 = NULL;
+
+    cudaFree(dev_framebuffer_DownScaleBy10);
+    dev_framebuffer_DownScaleBy10 = NULL;
+
+    cudaFree(dev_framebuffer_DownScaleBy10_2);
+    dev_framebuffer_DownScaleBy10_2 = NULL;
+
     cudaFree(dev_depth);
     dev_depth = NULL;
 
diff --git a/src/rasterize.h b/src/rasterize.h
index 560aae9..d01591c 100644
--- a/src/rasterize.h
+++ b/src/rasterize.h
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include "device_launch_parameters.h"
 
 namespace tinygltf{
     class Scene;
@@ -20,5 +21,5 @@ namespace
tinygltf{ void rasterizeInit(int width, int height); void rasterizeSetBuffers(const tinygltf::Scene & scene); -void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal); +void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal, int renderMode, glm::mat4 selfRotateM, bool openPostProcess, glm::vec3 viewForwardVec); void rasterizeFree(); diff --git a/src/rasterizeTools.h b/src/rasterizeTools.h index 46c701e..0bfea22 100644 --- a/src/rasterizeTools.h +++ b/src/rasterizeTools.h @@ -11,44 +11,44 @@ #include #include #include - -struct AABB { - glm::vec3 min; - glm::vec3 max; -}; +// +//struct AABB { +// glm::vec3 min; +// glm::vec3 max; +//}; /** * Multiplies a glm::mat4 matrix and a vec4. */ -__host__ __device__ static -glm::vec3 multiplyMV(glm::mat4 m, glm::vec4 v) { - return glm::vec3(m * v); -} +//__host__ __device__ static +//glm::vec3 multiplyMV(glm::mat4 m, glm::vec4 v) { +// return glm::vec3(m * v); +//} // CHECKITOUT /** * Finds the axis aligned bounding box for a given triangle. */ -__host__ __device__ static -AABB getAABBForTriangle(const glm::vec3 tri[3]) { - AABB aabb; - aabb.min = glm::vec3( - min(min(tri[0].x, tri[1].x), tri[2].x), - min(min(tri[0].y, tri[1].y), tri[2].y), - min(min(tri[0].z, tri[1].z), tri[2].z)); - aabb.max = glm::vec3( - max(max(tri[0].x, tri[1].x), tri[2].x), - max(max(tri[0].y, tri[1].y), tri[2].y), - max(max(tri[0].z, tri[1].z), tri[2].z)); - return aabb; -} +//__host__ __device__ static +//AABB getAABBForTriangle(const glm::vec3 tri[3]) { +// AABB aabb; +// aabb.min = glm::vec3( +// min(min(tri[0].x, tri[1].x), tri[2].x), +// min(min(tri[0].y, tri[1].y), tri[2].y), +// min(min(tri[0].z, tri[1].z), tri[2].z)); +// aabb.max = glm::vec3( +// max(max(tri[0].x, tri[1].x), tri[2].x), +// max(max(tri[0].y, tri[1].y), tri[2].y), +// max(max(tri[0].z, tri[1].z), tri[2].z)); +// return aabb; +//} // CHECKITOUT /** * Calculate the signed area of a given triangle. */ __host__ __device__ static -float calculateSignedArea(const glm::vec3 tri[3]) { +float calculateSignedArea(glm::vec3 tri[3]) { return 0.5 * ((tri[2].x - tri[0].x) * (tri[1].y - tri[0].y) - (tri[1].x - tri[0].x) * (tri[2].y - tri[0].y)); } @@ -57,7 +57,7 @@ float calculateSignedArea(const glm::vec3 tri[3]) { * Helper function for calculating barycentric coordinates. */ __host__ __device__ static -float calculateBarycentricCoordinateValue(glm::vec2 a, glm::vec2 b, glm::vec2 c, const glm::vec3 tri[3]) { +float calculateBarycentricCoordinateValue(glm::vec2 a, glm::vec2 b, glm::vec2 c, glm::vec3 tri[3]) { glm::vec3 baryTri[3]; baryTri[0] = glm::vec3(a, 0); baryTri[1] = glm::vec3(b, 0); @@ -70,7 +70,7 @@ float calculateBarycentricCoordinateValue(glm::vec2 a, glm::vec2 b, glm::vec2 c, * Calculate barycentric coordinates. */ __host__ __device__ static -glm::vec3 calculateBarycentricCoordinate(const glm::vec3 tri[3], glm::vec2 point) { +glm::vec3 calculateBarycentricCoordinate(glm::vec3 tri[3], glm::vec2 point) { float beta = calculateBarycentricCoordinateValue(glm::vec2(tri[0].x, tri[0].y), point, glm::vec2(tri[2].x, tri[2].y), tri); float gamma = calculateBarycentricCoordinateValue(glm::vec2(tri[0].x, tri[0].y), glm::vec2(tri[1].x, tri[1].y), point, tri); float alpha = 1.0 - beta - gamma; @@ -93,9 +93,152 @@ bool isBarycentricCoordInBounds(const glm::vec3 barycentricCoord) { * For a given barycentric coordinate, compute the corresponding z position * (i.e. depth) on the triangle. 
*/ -__host__ __device__ static -float getZAtCoordinate(const glm::vec3 barycentricCoord, const glm::vec3 tri[3]) { - return -(barycentricCoord.x * tri[0].z - + barycentricCoord.y * tri[1].z - + barycentricCoord.z * tri[2].z); +//__host__ __device__ static +//float getZAtCoordinate(const glm::vec3 barycentricCoord, const glm::vec3 tri[3]) { +// return -(barycentricCoord.x * tri[0].z +// + barycentricCoord.y * tri[1].z +// + barycentricCoord.z * tri[2].z); +//} + +//__device__ +//int conductDepthTest(int screen_space_x, int screen_space_y, glm::vec3 t1, glm::vec3 t2, glm::vec3 t3, int fragmentIdx, int* depth) { +// //IMPORTANT! +// //Should interpolate Z (depth) value first +// //z (depth) value is in camera(eye) space +// float lerp_depth = depthValuePersCorrectionLerp(glm::vec3(screen_space_x, screen_space_y, 0.f), t1, t2, t3); +// +// // OK... atomicMin only works for Int +// // but we want more accuracy for our depth value +// // so TRICK here! +// // multiply a really large number to get accuracy +// // just pay attention integar is between -2147483648 - 2147483647 +// // 10000 may be a acceptable number +// int lerp_depth_int = (int)(lerp_depth * 10000.0f); +// +// // Atomic depth buffer writing +// int old = depth[fragmentIdx]; +// int assumed; +// +// do { +// assumed = old; +// old = atomicMin(&depth[fragmentIdx], lerp_depth_int); +// } while (assumed != old); +// +// return lerp_depth_int; +//} + +// test whether position p is in the triangle formed by p1, p2, p3 +__device__ +bool isPosInTriange(glm::vec3 p, + glm::vec3 p1, glm::vec3 p2, glm::vec3 p3) { + glm::vec3 v(p[0], p[1], 0.f); + glm::vec3 v1(p1[0], p1[1], 0.f); + glm::vec3 v2(p2[0], p2[1], 0.f); + glm::vec3 v3(p3[0], p3[1], 0.f); + + float s = 0.5f * glm::length(glm::cross(v1 - v2, v3 - v2)); + float s1 = 0.5f * glm::length(glm::cross(v - v2, v3 - v2)); + float s2 = 0.5f * glm::length(glm::cross(v - v3, v1 - v3)); + float s3 = 0.5f * glm::length(glm::cross(v - v1, v2 - v1)); + + return glm::abs(s1 + s2 + s3 - s) < 0.1f; +} + + +// p here should be glm::vec3 (ScreenSpace.x, ScreenSpace.y, 0) +// p1, p2, p3 here should be glm::vec3(ScreenSpace.x, ScreenSpace.y, EyeSpace.z) +__device__ +float depthValuePersCorrectionLerp(glm::vec3 p, + glm::vec3 p1, glm::vec3 p2, glm::vec3 p3) +{ + glm::vec3 v1(p1[0], p1[1], 0.f); + glm::vec3 v2(p2[0], p2[1], 0.f); + glm::vec3 v3(p3[0], p3[1], 0.f); + + + float s = 0.5f * glm::length(glm::cross(v1 - v2, v3 - v2)); + float s1 = 0.5f * glm::length(glm::cross(p - v2, v3 - v2)); + float s2 = 0.5f * glm::length(glm::cross(p - v3, v1 - v3)); + float s3 = 0.5f * glm::length(glm::cross(p - v1, v2 - v1)); + + return 1.0f / ((s1 / (p1[2] * s)) + (s2 / (p2[2] * s)) + (s3 / (p3[2] * s))); } + +// p, p1, p2, p3 here should be glm::vec3(ScreenSpace.x, ScreenSpace.y, EyeSpace.z) +__device__ +glm::vec2 vec2AttributePersCorrectionLerp(glm::vec3 p, + glm::vec3 p1, glm::vec3 p2, glm::vec3 p3, + glm::vec2 attribute1, glm::vec2 attribute2, glm::vec2 attribute3) +{ + glm::vec3 v(p[0], p[1], 0.f); + glm::vec3 v1(p1[0], p1[1], 0.f); + glm::vec3 v2(p2[0], p2[1], 0.f); + glm::vec3 v3(p3[0], p3[1], 0.f); + + + float s = 0.5f * glm::length(glm::cross(v1 - v2, v3 - v2)); + float s1 = 0.5f * glm::length(glm::cross(v - v2, v3 - v2)); + float s2 = 0.5f * glm::length(glm::cross(v - v3, v1 - v3)); + float s3 = 0.5f * glm::length(glm::cross(v - v1, v2 - v1)); + + return p[2] * ((attribute1 / p1[2]) * (s1 / s) + + (attribute2 / p2[2]) * (s2 / s) + + (attribute3 / p3[2]) * (s3 / s)); +} + +// p, p1, p2, p3 here should be 
glm::vec3(ScreenSpace.x, ScreenSpace.y, EyeSpace.z) +__device__ +glm::vec3 vec3AttributePersCorrectionLerp(glm::vec3 p, + glm::vec3 p1, glm::vec3 p2, glm::vec3 p3, + glm::vec3 attribute1, glm::vec3 attribute2, glm::vec3 attribute3) +{ + glm::vec3 v(p[0], p[1], 0.f); + glm::vec3 v1(p1[0], p1[1], 0.f); + glm::vec3 v2(p2[0], p2[1], 0.f); + glm::vec3 v3(p3[0], p3[1], 0.f); + + + float s = 0.5f * glm::length(glm::cross(v1 - v2, v3 - v2)); + float s1 = 0.5f * glm::length(glm::cross(v - v2, v3 - v2)); + float s2 = 0.5f * glm::length(glm::cross(v - v3, v1 - v3)); + float s3 = 0.5f * glm::length(glm::cross(v - v1, v2 - v1)); + + return p[2] * ((attribute1 / p1[2]) * (s1 / s) + + (attribute2 / p2[2]) * (s2 / s) + + (attribute3 / p3[2]) * (s3 / s)); +} + + +__device__ +bool isPointOnTriangleEdge(glm::vec2 p, glm::vec3 t1, glm::vec3 t2, glm::vec3 t3) { + + glm::vec3 tris[3]; + tris[0] = t1; + tris[1] = t2; + tris[2] = t3; + + glm::vec3 barycentricCoord = calculateBarycentricCoordinate(tris, p); + + float Epsilon = 0.1f; + + if (glm::abs(barycentricCoord.x) < Epsilon) { + if (barycentricCoord.y >= 0.0f && barycentricCoord.y <= 1.0f && + barycentricCoord.z >= 0.0f && barycentricCoord.z <= 1.0f) { + return true; + } + } + else if (glm::abs(barycentricCoord.y) < Epsilon) { + if (barycentricCoord.x >= 0.0f && barycentricCoord.x <= 1.0f && + barycentricCoord.z >= 0.0f && barycentricCoord.z <= 1.0f) { + return true; + } + } + else if (glm::abs(barycentricCoord.z) < Epsilon) { + if (barycentricCoord.y >= 0.0f && barycentricCoord.y <= 1.0f && + barycentricCoord.x >= 0.0f && barycentricCoord.x <= 1.0f) { + return true; + } + } + + return false; +} \ No newline at end of file