diff --git a/README.md b/README.md
index cad1abd..45358d5 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,29 @@
 CUDA Rasterizer
 ===============
 
-[CLICK ME FOR INSTRUCTION OF THIS PROJECT](./INSTRUCTION.md)
-
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Alexander Perry
+* Tested on: Windows 10, i5-2410M @ 2.30GHz 8GB, NVS 4200M (personal computer)
+
+![](./renders/duck_tex_fixed.PNG)
+![](./renders/recording.gif)
+
+### Features
+#### Shared Memory
+I used shared memory to load primitives in the rasterization function. This avoids using global memory for the most common accesses as most of the computation in this function depends on the primitive.
+
+#### Texture loading.
+I implemented correct texture mapping for both proper color and UV coordinates based on perspective. See before and after:
+
+![](./renders/checkerboard_bad_interpolation.PNG)
+![](./renders/checkerboard_fixed.PNG)
+This also includes bilinear filtering on the textures.
 
-### (TODO: Your README)
+### Analysis
+The largest optimization I made was to use multiple CUDA streams for rasterization. I launched a different kernel for each primitive to render with block size determined by the AABB of the primitive. This allows the GPU to be more efficient at scheduling the kernels as they can be scheduled in any order. I tested this by loading the duck object, not moving the camera and wating for the FPS to settle. With this optimization on, I get 22fps. With the optimization off I get 18fps. This is an improvement of 22%.
 
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+Another minor optimization I made was in the render function. In order to avoid computing lambertian reflection for every fragment, I allowed for early termination of fragments whose color was black. With the optimization off, the render function takes 10.24ms. With the optimaztion on, the render function takes only 5.6ms. This is a speed up of almost 50%. This was computed using the same setup as the previous test, which creates a sceme with a lot of background, which is the ideal case for the optimization.
 
 
 ### Credits
diff --git a/renders/Early_no_depth.PNG b/renders/Early_no_depth.PNG
new file mode 100644
index 0000000..1e6809c
Binary files /dev/null and b/renders/Early_no_depth.PNG differ
diff --git a/renders/checkerboard_bad_interpolation.PNG b/renders/checkerboard_bad_interpolation.PNG
new file mode 100644
index 0000000..7a262a9
Binary files /dev/null and b/renders/checkerboard_bad_interpolation.PNG differ
diff --git a/renders/checkerboard_fixed.PNG b/renders/checkerboard_fixed.PNG
new file mode 100644
index 0000000..38e5df5
Binary files /dev/null and b/renders/checkerboard_fixed.PNG differ
diff --git a/renders/depth_normals.PNG b/renders/depth_normals.PNG
new file mode 100644
index 0000000..568a296
Binary files /dev/null and b/renders/depth_normals.PNG differ
diff --git a/renders/duck_tex_fixed.PNG b/renders/duck_tex_fixed.PNG
new file mode 100644
index 0000000..f643e38
Binary files /dev/null and b/renders/duck_tex_fixed.PNG differ
diff --git a/renders/lambert_shading.PNG b/renders/lambert_shading.PNG
new file mode 100644
index 0000000..0064ed7
Binary files /dev/null and b/renders/lambert_shading.PNG differ
diff --git a/renders/recording.gif b/renders/recording.gif
new file mode 100644
index 0000000..9ab9683
Binary files /dev/null and b/renders/recording.gif differ
diff --git a/renders/texture_bug.PNG b/renders/texture_bug.PNG
new file mode 100644
index 0000000..e8a95ac
Binary files /dev/null and b/renders/texture_bug.PNG differ
diff --git a/src/rasterize.cu b/src/rasterize.cu
index 1262a09..d7bf145 100644
--- a/src/rasterize.cu
+++ b/src/rasterize.cu
@@ -18,6 +18,8 @@
 #include <glm/gtc/quaternion.hpp>
 #include <glm/gtc/matrix_transform.hpp>
 
+#define DOTS 0
+
 namespace {
 
 	typedef unsigned short VertexIndex;
@@ -43,10 +45,10 @@ namespace {
 
 		 glm::vec3 eyePos;	// eye space position used for shading
 		 glm::vec3 eyeNor;	// eye space normal used for shading, cuz normal will go wrong after perspective transformation
-		// glm::vec3 col;
+		 glm::vec3 col;
 		 glm::vec2 texcoord0;
 		 TextureData* dev_diffuseTex = NULL;
-		// int texWidth, texHeight;
+		 int texWidth, texHeight;
 		// ...
 	};
 
@@ -62,10 +64,10 @@ namespace {
 		// The attributes listed below might be useful, 
 		// but always feel free to modify on your own
 
-		// glm::vec3 eyePos;	// eye space position used for shading
-		// glm::vec3 eyeNor;
-		// VertexAttributeTexcoord texcoord0;
-		// TextureData* dev_diffuseTex;
+		glm::vec3 eyePos;	// eye space position used for shading
+		glm::vec3 eyeNor;
+		VertexAttributeTexcoord texcoord0;
+		TextureData* dev_diffuseTex;
 		// ...
 	};
 
@@ -108,8 +110,11 @@ static int totalNumPrimitives = 0;
 static Primitive *dev_primitives = NULL;
 static Fragment *dev_fragmentBuffer = NULL;
 static glm::vec3 *dev_framebuffer = NULL;
+static AABB *dev_aabbs = NULL;
 
-static int * dev_depth = NULL;	// you might need this buffer when doing depth test
+static int *dev_depth = NULL;	// you might need this buffer when doing depth test
+static float *dev_depth_f = NULL;
+static int *dev_depth_mutex = NULL ;
 
 /**
  * Kernel that writes the image to the OpenGL PBO directly.
@@ -141,12 +146,23 @@ void render(int w, int h, Fragment *fragmentBuffer, glm::vec3 *framebuffer) {
     int x = (blockIdx.x * blockDim.x) + threadIdx.x;
     int y = (blockIdx.y * blockDim.y) + threadIdx.y;
     int index = x + (y * w);
+    glm::vec3 light_pos(5, 5, 10);
+    glm::vec3 light_col(1, 1, 1);
 
     if (x < w && y < h) {
-        framebuffer[index] = fragmentBuffer[index].color;
-
-		// TODO: add your fragment shader code here
-
+      //framebuffer[index] = fragmentBuffer[index].color;
+      glm::vec3 fColor = fragmentBuffer[index].color;
+      if (fColor == glm::vec3(0)) {
+        framebuffer[index] = glm::vec3(0);
+      } else {
+
+        // TODO: add your fragment shader code here
+
+        //lambertian shading
+        glm::vec3 to_light = light_pos - fragmentBuffer[index].eyePos;
+        glm::vec3 col = glm::max(glm::vec3(0), glm::dot(glm::normalize(to_light), fragmentBuffer[index].eyeNor) * light_col) + glm::vec3(0.1);
+        framebuffer[index] = col*fragmentBuffer[index].color;
+      }
     }
 }
 
@@ -165,6 +181,10 @@ void rasterizeInit(int w, int h) {
     
 	cudaFree(dev_depth);
 	cudaMalloc(&dev_depth, width * height * sizeof(int));
+  cudaFree(dev_depth_f);
+  cudaMalloc(&dev_depth_f, width * height * sizeof(float));
+	cudaFree(dev_depth_mutex);
+	cudaMalloc(&dev_depth_mutex, width * height * sizeof(int));
 
 	checkCUDAError("rasterizeInit");
 }
@@ -182,6 +202,18 @@ void initDepth(int w, int h, int * depth)
 	}
 }
 
+__global__
+void initDepth(int w, int h, float * depth)
+{
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+	if (x < w && y < h)
+	{
+		int index = x + (y * w);
+		depth[index] = 10.0f; // depth buffer is 0-1 so 10 is way beyond max.
+	}
+}
 
 /**
 * kern function with support for stride to sometimes replace cudaMemcpy
@@ -600,6 +632,7 @@ void rasterizeSetBuffers(const tinygltf::Scene & scene) {
 	// 3. Malloc for dev_primitives
 	{
 		cudaMalloc(&dev_primitives, totalNumPrimitives * sizeof(Primitive));
+    cudaMalloc(&dev_aabbs, totalNumPrimitives * sizeof(AABB));
 	}
 	
 
@@ -638,10 +671,23 @@ void _vertexTransformAndAssembly(
 		// Multiply the MVP matrix for each vertex position, this will transform everything into clipping space
 		// Then divide the pos by its w element to transform into NDC space
 		// Finally transform x and y to viewport space
+    glm::vec4 w_pos(primitive.dev_position[vid], 1.0f);
+    glm::vec4 clip_pos = MVP * w_pos;
+    clip_pos /= -clip_pos.w;
+    glm::vec4 eye_pos = MV * w_pos;
+    glm::vec3 eye_nor = MV_normal * primitive.dev_normal[vid];
 
 		// TODO: Apply vertex assembly here
-		// Assemble all attribute arraies into the primitive array
-		
+		// Assemble all attribute arrays into the primitive array
+    VertexOut* vOut = &primitive.dev_verticesOut[vid];
+    vOut->pos = clip_pos*glm::vec4(1,1,1,1);
+    vOut->eyePos = glm::vec3(eye_pos);
+    vOut->eyeNor = eye_nor;
+    vOut->col = glm::vec3(1, 0, 0); //set color to red
+    vOut->dev_diffuseTex = primitive.dev_diffuseTex;
+    vOut->texcoord0 = primitive.dev_texcoord0[vid];
+    vOut->texWidth  = primitive.diffuseTexWidth;
+    vOut->texHeight = primitive.diffuseTexHeight;
 	}
 }
 
@@ -660,12 +706,12 @@ void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_
 		// TODO: uncomment the following code for a start
 		// This is primitive assembly for triangles
 
-		//int pid;	// id for cur primitives vector
-		//if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) {
-		//	pid = iid / (int)primitive.primitiveType;
-		//	dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType]
-		//		= primitive.dev_verticesOut[primitive.dev_indices[iid]];
-		//}
+		int pid;	// id for cur primitives vector
+		if (primitive.primitiveMode == TINYGLTF_MODE_TRIANGLES) {
+			pid = iid / (int)primitive.primitiveType;
+			dev_primitives[pid + curPrimitiveBeginId].v[iid % (int)primitive.primitiveType]
+				= primitive.dev_verticesOut[primitive.dev_indices[iid]];
+		}
 
 
 		// TODO: other primitive types (point, line)
@@ -673,16 +719,189 @@ void _primitiveAssembly(int numIndices, int curPrimitiveBeginId, Primitive* dev_
 	
 }
 
+__global__ void rasterize_points(int numPrimitives, int width, int height, Primitive* p, Fragment *fragments) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < numPrimitives) {
+    for (int i = 0; i < 3; ++i) {
+      glm::vec4 pos = p[index].v[i].pos;
+      glm::ivec2 pix_pos = (glm::vec2(pos) + glm::vec2(1, 1))*glm::vec2(width/2, height/2);
+      int fid = pix_pos.x + pix_pos.y * width;
+      fragments[fid].color = glm::vec3(1.0f);
+    }
+  }
+}
+
+/*__global__ void rasterize_wire(int numPrimitives, int height, int width, Primitive* p, Fragment *fragments) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < numPrimitives) {
+    for (int i = 0; i < 3; ++i) {
+      glm::vec4 pos = p[index].v[i].pos;
+      glm::ivec2 pix_pos = (glm::vec2(pos) + glm::vec2(1, 1))*glm::vec2(width/2, height/2);
+      int fid = pix_pos.x + pix_pos.y * width;
+      fragments[fid].color = glm::vec3(1.0f);
+    }
+  }
+}*/
+
+__device__ void convertToPixels(Primitive *p, glm::vec3 *tri, int width, int height) {
+  glm::vec3 pos;
+  pos = glm::vec3(p->v[0].pos);
+  tri[0] = (pos + glm::vec3(1, 1, 0)) * glm::vec3(width/2, height/2, 1);
+  pos = glm::vec3(p->v[1].pos);
+  tri[1] = (pos + glm::vec3(1, 1, 0)) * glm::vec3(width/2, height/2, 1);
+  pos = glm::vec3(p->v[2].pos);
+  tri[2] = (pos + glm::vec3(1, 1, 0)) * glm::vec3(width/2, height/2, 1);
+}
+
+__device__ void InterpolateTri(Primitive *p, glm::vec3 bary, Fragment *f, glm::vec3 tri[3], float z) {
+  f->color = (bary.x * p->v[0].col * tri[0].z +
+              bary.y * p->v[1].col * tri[1].z +
+              bary.z * p->v[2].col * tri[2].z) * -z;
+  f->eyePos = (bary.x * p->v[0].eyePos * tri[0].z +
+               bary.y * p->v[1].eyePos * tri[1].z +
+               bary.z * p->v[2].eyePos * tri[2].z) * -z;
+  f->eyeNor = (bary.x * p->v[0].eyeNor * tri[0].z +
+               bary.y * p->v[1].eyeNor * tri[1].z +
+               bary.z * p->v[2].eyeNor * tri[2].z) * -z;
+  f->texcoord0 = (bary.x * p->v[0].texcoord0 * tri[0].z +
+                  bary.y * p->v[1].texcoord0 * tri[1].z +
+                  bary.z * p->v[2].texcoord0 * tri[2].z) * -z;
+}
+
+__global__ void rasterizeTriNoShared(int width, int height, int start_x, int start_y,
+    int index, Primitive *p, Fragment *fragments, float *dev_depth,
+    int *dev_depth_mutex) {
+  int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+  int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+  if (x < width && y < height) {
+    glm::vec3 tri[3];
+    convertToPixels(&p[index], tri, width, height);
+    tri[0].z = 1 / tri[0].z; tri[1].z = 1 / tri[1].z; tri[2].z = 1 / tri[2].z;
+    glm::vec2 test_point = glm::vec2(start_x + x, start_y + y);
+    if (test_point.x >= width || test_point.x < 0 || test_point.y >= height || test_point.y < 0);
+    else {
+      glm::vec3 bary = calculateBarycentricCoordinate(tri, test_point);
+      if (isBarycentricCoordInBounds(bary)) {
+        int fid = test_point.x + test_point.y * width;
+        /*int *mutex = &dev_depth_mutex[fid];
+        bool is_set;
+        do {
+          is_set = (atomicCAS(mutex, 0, 1) == 0);
+          if (is_set) {*/
+            float depth = 1/getZAtCoordinate(bary, tri);
+            if (depth < dev_depth[fid]) {
+              Fragment *f = &fragments[fid];
+              dev_depth[fid] = depth;
+              InterpolateTri(&p[index], bary, f, tri, depth);
+              if (p[index].v[0].dev_diffuseTex == NULL) {
+                fragments[fid].color = p[index].v[0].col;
+              } else {
+                glm::vec2 texcoord = f->texcoord0*glm::vec2(p[index].v[0].texWidth, p[index].v[0].texHeight);
+                glm::ivec2 texcoord_min(texcoord);
+                glm::vec2 weights = texcoord - glm::vec2(texcoord_min);
+                int texWidth = p[index].v[0].texWidth;
+                glm::u8vec3* tex_colors = (glm::u8vec3*)p[index].v[0].dev_diffuseTex;
+                glm::vec3 col = glm::vec3(tex_colors[texcoord_min.x + texcoord_min.y * texWidth])/glm::vec3(255.0)*weights.x*weights.y +
+                                glm::vec3(tex_colors[texcoord_min.x + (texcoord_min.y + 1) * texWidth])/glm::vec3(255.0)*weights.x*(1-weights.y) +
+                                glm::vec3(tex_colors[texcoord_min.x + 1 + texcoord_min.y * texWidth])/glm::vec3(255.0)*(1-weights.x)*weights.y +
+                                glm::vec3(tex_colors[texcoord_min.x + 1 + (texcoord_min.y + 1) * texWidth])/glm::vec3(255.0)*(1-weights.x)*(1-weights.y);
+                fragments[fid].color = col;
+              }
+            }
+            /*mutex = 0;
+          }
+        } while (!is_set);*/
+      }
+    }
+
+  }
+}
+
+__global__ void rasterizeTri(int width, int height, int start_x, int start_y,
+    int index, Primitive *primitives, Fragment *fragments, float *dev_depth,
+    int *dev_depth_mutex) {
+  extern __shared__ Primitive p[];
+  int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+  int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+  if (threadIdx.x == 0 && threadIdx.y == 0) {
+    *p = primitives[index];
+  }
+  __syncthreads();
+  if (x < width && y < height) {
+    glm::vec3 tri[3];
+    convertToPixels(p, tri, width, height);
+    tri[0].z = 1 / tri[0].z; tri[1].z = 1 / tri[1].z; tri[2].z = 1 / tri[2].z;
+    glm::vec2 test_point = glm::vec2(start_x + x, start_y + y);
+    if (test_point.x >= width || test_point.x < 0 || test_point.y >= height || test_point.y < 0);
+    else {
+      glm::vec3 bary = calculateBarycentricCoordinate(tri, test_point);
+      if (isBarycentricCoordInBounds(bary)) {
+        int fid = test_point.x + test_point.y * width;
+        int *mutex = &dev_depth_mutex[fid];
+        bool is_set;
+        do {
+          is_set = (atomicCAS(mutex, 0, 1) == 0);
+          if (is_set) {
+            float depth = 1/getZAtCoordinate(bary, tri);
+            if (depth < dev_depth[fid]) {
+              Fragment *f = &fragments[fid];
+              dev_depth[fid] = depth;
+              InterpolateTri(p, bary, f, tri, depth);
+              if (p->v[0].dev_diffuseTex == NULL) {
+                fragments[fid].color = p->v[0].col;
+              } else {
+                glm::vec2 texcoord = f->texcoord0*glm::vec2(p->v[0].texWidth, p->v[0].texHeight);
+                glm::ivec2 texcoord_min(texcoord);
+                glm::vec2 weights = texcoord - glm::vec2(texcoord_min);
+                int texWidth = p->v[0].texWidth;
+                glm::u8vec3* tex_colors = (glm::u8vec3*)p->v[0].dev_diffuseTex;
+                glm::vec3 col = glm::vec3(tex_colors[texcoord_min.x + texcoord_min.y * texWidth])/glm::vec3(255.0)*weights.x*weights.y +
+                                glm::vec3(tex_colors[texcoord_min.x + (texcoord_min.y + 1) * texWidth])/glm::vec3(255.0)*weights.x*(1-weights.y) +
+                                glm::vec3(tex_colors[texcoord_min.x + 1 + texcoord_min.y * texWidth])/glm::vec3(255.0)*(1-weights.x)*weights.y +
+                                glm::vec3(tex_colors[texcoord_min.x + 1 + (texcoord_min.y + 1) * texWidth])/glm::vec3(255.0)*(1-weights.x)*(1-weights.y);
+                fragments[fid].color = col;
+              }
+            }
+            *mutex = 0;
+          }
+        } while (!is_set);
+      }
+    }
+
+  }
+}
+
+__global__ void computeAABBs(int numPrimitives, Primitive *p, AABB *aabbs,
+                             int width, int height) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < numPrimitives) {
+    glm::vec3 tri[3];
+    convertToPixels(&p[index], tri, width, height);
+    aabbs[index] = getAABBForTriangle(tri);
+  }
+}
+
+/*__global__ void rasterize(int numPrimitives, int height, int width,
+                          Primitive* p, Fragment *fragments) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < numPrimitives) {
+    glm::vec3 tri[3] = {glm::vec3(p[index].v[0].pos), 
+                        glm::vec3(p[index].v[1].pos),
+                        glm::vec3(p[index].v[2].pos)};
+    AABB aabb = getAABBForTriangle(tri);
+    raterizeTri<<<
+  }
+}*/
 
 
 /**
  * Perform rasterization.
  */
 void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const glm::mat3 MV_normal) {
-    int sideLength2d = 8;
-    dim3 blockSize2d(sideLength2d, sideLength2d);
-    dim3 blockCount2d((width  - 1) / blockSize2d.x + 1,
-		(height - 1) / blockSize2d.y + 1);
+  int sideLength2d = 16;
+  dim3 blockSize2d(sideLength2d, sideLength2d);
+  dim3 blockCount2d((width  - 1) / blockSize2d.x + 1,
+                    (height - 1) / blockSize2d.y + 1);
 
 	// Execute your rasterization pipeline here
 	// (See README for rasterization pipeline outline.)
@@ -702,10 +921,10 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
 				dim3 numBlocksForVertices((p->numVertices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
 				dim3 numBlocksForIndices((p->numIndices + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x);
 
-				_vertexTransformAndAssembly << < numBlocksForVertices, numThreadsPerBlock >> >(p->numVertices, *p, MVP, MV, MV_normal, width, height);
+				_vertexTransformAndAssembly<<<numBlocksForVertices, numThreadsPerBlock>>>(p->numVertices, *p, MVP, MV, MV_normal, width, height);
 				checkCUDAError("Vertex Processing");
 				cudaDeviceSynchronize();
-				_primitiveAssembly << < numBlocksForIndices, numThreadsPerBlock >> >
+				_primitiveAssembly<<<numBlocksForIndices, numThreadsPerBlock>>>
 					(p->numIndices, 
 					curPrimitiveBeginId, 
 					dev_primitives, 
@@ -720,18 +939,55 @@ void rasterize(uchar4 *pbo, const glm::mat4 & MVP, const glm::mat4 & MV, const g
 	}
 	
 	cudaMemset(dev_fragmentBuffer, 0, width * height * sizeof(Fragment));
-	initDepth << <blockCount2d, blockSize2d >> >(width, height, dev_depth);
+	//initDepth<<<blockCount2d, blockSize2d>>>(width, height, dev_depth);
+	initDepth<<<blockCount2d, blockSize2d>>>(width, height, dev_depth_f);
+	cudaMemset(dev_depth_mutex, 0, width * height * sizeof(int));
 	
 	// TODO: rasterize
+  dim3 numThreadsPerBlock = (128);
+  dim3 numBlocksForPrimitives = (totalNumPrimitives + numThreadsPerBlock.x - 1) / numThreadsPerBlock.x;
+#if DOTS
+  rasterize_points<<<numBlocksForPrimitives, numThreadsPerBlock>>>(totalNumPrimitives, width, height, dev_primitives, dev_fragmentBuffer);
+#else
+  computeAABBs<<<numBlocksForPrimitives, numThreadsPerBlock>>>(totalNumPrimitives, dev_primitives, dev_aabbs, width, height);
+	checkCUDAError("aabbs");
+  AABB *aabbs = (AABB *)malloc(sizeof(AABB) * totalNumPrimitives);
+  cudaMemcpy(aabbs, dev_aabbs, sizeof(AABB) * totalNumPrimitives, cudaMemcpyDeviceToHost);
+  cudaStream_t streams[16];
+  for (int i = 0; i < 16; ++i) {
+    cudaStreamCreate(&streams[i]);
+  }
+  for (int i = 0; i < totalNumPrimitives; ++i) {
+    glm::vec3 max = aabbs[i].max;
+    glm::vec3 min = aabbs[i].min;
+    //if (min.x > 800 || min.y > 800 || min.z > 0 || max.x < 0 || max.y < 0 || max.z < -1) continue;
+    if (min.x > 800 || min.y > 800 || max.x < 0 || max.y < 0) continue;
+    max = glm::min(max, glm::vec3(800, 800, 10));
+    min = glm::max(min, glm::vec3(0, 0, -10));
+    dim3 blockCountForRast((max.x - min.x + blockSize2d.x - 1) / blockSize2d.x + 1,
+                           (max.y - min.y + blockSize2d.x - 1) / blockSize2d.y + 1);
+    rasterizeTri<<<blockCountForRast, blockSize2d, sizeof(Primitive), streams[i%16]>>>
+        (width, height, (int)min.x, (int)min.y, i, dev_primitives,
+         dev_fragmentBuffer, dev_depth_f , dev_depth_mutex);
+    /*cudaDeviceSynchronize();
+    cudaError_t err = cudaGetLastError();
+    if (cudaSuccess != err) {
+      printf("tri error: %d (%d, %d)\n", i, blockCountForRast.x, blockCountForRast.y);
+    }
+    checkCUDAError("tri");*/
+  }
+#endif
+  cudaDeviceSynchronize();
+	checkCUDAError("rasterize");
 
 
 
-    // Copy depthbuffer colors into framebuffer
-	render << <blockCount2d, blockSize2d >> >(width, height, dev_fragmentBuffer, dev_framebuffer);
+  // Copy depthbuffer colors into framebuffer
+	render<<<blockCount2d, blockSize2d>>>(width, height, dev_fragmentBuffer, dev_framebuffer);
 	checkCUDAError("fragment shader");
-    // Copy framebuffer into OpenGL buffer for OpenGL previewing
-    sendImageToPBO<<<blockCount2d, blockSize2d>>>(pbo, width, height, dev_framebuffer);
-    checkCUDAError("copy render result to pbo");
+  // Copy framebuffer into OpenGL buffer for OpenGL previewing
+  sendImageToPBO<<<blockCount2d, blockSize2d>>>(pbo, width, height, dev_framebuffer);
+  checkCUDAError("copy render result to pbo");
 }
 
 /**
@@ -760,17 +1016,26 @@ void rasterizeFree() {
 
 	////////////
 
-    cudaFree(dev_primitives);
-    dev_primitives = NULL;
+  cudaFree(dev_primitives);
+  dev_primitives = NULL;
+
+  cudaFree(dev_aabbs);
+  dev_aabbs = NULL;
 
 	cudaFree(dev_fragmentBuffer);
 	dev_fragmentBuffer = NULL;
 
-    cudaFree(dev_framebuffer);
-    dev_framebuffer = NULL;
+  cudaFree(dev_framebuffer);
+  dev_framebuffer = NULL;
 
 	cudaFree(dev_depth);
 	dev_depth = NULL;
 
-    checkCUDAError("rasterize Free");
+  cudaFree(dev_depth_f);
+  dev_depth_f = NULL;
+
+  cudaFree(dev_depth_mutex);
+  dev_depth_mutex = NULL;
+
+  checkCUDAError("rasterize Free");
 }