diff --git a/README.md b/README.md
index 98dd9a8..6a45c0f 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,35 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture,
 Project 1 - Flocking**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Sarah Forcier
+* Tested on: GeForce GTX 1070
 
-### (TODO: Your README)
+| 100,000 boids | 5,000 boids | 
+| ------------- | ----------- |
+| ![](flocking.gif) | ![](flocking1.gif) |
+| 720 FPS | 620 FPS |
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+### Performance Analysis
+
+#### Framerate change with increasing number of boids
+![](typecomparison.png)
+
+#### Framerate change with and without visualization
+![](visualization.png)
+
+#### Framerate change with increasing block size
+![](blocksize.png)
+
+### Q&A
+
+#### How does changing the number of boids affect performance? Why?
+Generally, increasing the number of boids slows performance. However, the brute force method performs comparably with the grid-accelerated method for lower boid counts because iterating through all the boids can be completed faster than the necessary set up required for sorting the boids with respect to the grid.
+
+#### How does changing the block count and block size affect performance? Why?
+The performance increases with block size, but plateaus at 32 because the GPU architecture has a warp size of 32. Smaller block sizes do not take advantage of the full warp, but larger sizes that are multiples of 32 cannot get better performance because they have already use the full warp. However, for blocksizes that are not multiples of 32, performance is negatively affected because some warps will not be completely filled. 
+
+#### For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?
+A speed up was achieved by using a coherent uniform grid. This is the expected behavior because it reduces the number of cache misses that occur when the position and velocity is looked up. In the uniform grid implementation, the boid pointers are sorted in grid order but these pointers point to uncontiguous memory. The coherent uniform grid sorts the velocity and position data directly so that these values are contiguous.  
+
+#### Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not?
+Decreasing the cell width to the neighborhood distance, and checking 27 neighboring cells decreased the performace by a factor of 7 for the coherent and uniform grid methods. The brute force method is not affected by this change. The simulation is much slower because there are simply more cells to check during each step, and with more cells to check, there is a greater probability of cache misses. This test was run with 10,000 boids because the FPS is zero for simulations with 50,000 boids and greater.   
\ No newline at end of file
diff --git a/blocksize.png b/blocksize.png
new file mode 100644
index 0000000..6aa044e
Binary files /dev/null and b/blocksize.png differ
diff --git a/flocking.gif b/flocking.gif
new file mode 100644
index 0000000..ad5607c
Binary files /dev/null and b/flocking.gif differ
diff --git a/flocking1.gif b/flocking1.gif
new file mode 100644
index 0000000..33b1611
Binary files /dev/null and b/flocking1.gif differ
diff --git a/performance.xlsx b/performance.xlsx
new file mode 100644
index 0000000..ccbaf85
Binary files /dev/null and b/performance.xlsx differ
diff --git a/src/kernel.cu b/src/kernel.cu
index aaf0fbf..51e088e 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -21,14 +21,14 @@
 * Check for CUDA errors; print and exit if there was a problem.
 */
 void checkCUDAError(const char *msg, int line = -1) {
-  cudaError_t err = cudaGetLastError();
-  if (cudaSuccess != err) {
-    if (line >= 0) {
-      fprintf(stderr, "Line %d: ", line);
-    }
-    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
-    exit(EXIT_FAILURE);
-  }
+	cudaError_t err = cudaGetLastError();
+	if (cudaSuccess != err) {
+		if (line >= 0) {
+			fprintf(stderr, "Line %d: ", line);
+		}
+		fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
+		exit(EXIT_FAILURE);
+	}
 }
 
 
@@ -42,7 +42,7 @@ void checkCUDAError(const char *msg, int line = -1) {
 // LOOK-1.2 Parameters for the boids algorithm.
 // These worked well in our reference implementation.
 #define rule1Distance 5.0f
-#define rule2Distance 3.0f
+#define rule2Distance 1.0f
 #define rule3Distance 5.0f
 
 #define rule1Scale 0.01f
@@ -66,7 +66,8 @@ dim3 threadsPerBlock(blockSize);
 // Consider why you would need two velocity buffers in a simulation where each
 // boid cares about its neighbors' velocities.
 // These are called ping-pong buffers.
-glm::vec3 *dev_pos;
+glm::vec3 *dev_pos1;
+glm::vec3 *dev_pos2;
 glm::vec3 *dev_vel1;
 glm::vec3 *dev_vel2;
 
@@ -76,20 +77,20 @@ glm::vec3 *dev_vel2;
 // For efficient sorting and the uniform grid. These should always be parallel.
 int *dev_particleArrayIndices; // What index in dev_pos and dev_velX represents this particle?
 int *dev_particleGridIndices; // What grid cell is this particle in?
-// needed for use with thrust
+							  // needed for use with thrust
 thrust::device_ptr<int> dev_thrust_particleArrayIndices;
 thrust::device_ptr<int> dev_thrust_particleGridIndices;
 
 int *dev_gridCellStartIndices; // What part of dev_particleArrayIndices belongs
 int *dev_gridCellEndIndices;   // to this cell?
 
-// TODO-2.3 - consider what additional buffers you might need to reshuffle
-// the position and velocity data to be coherent within cells.
+							   // TODO-2.3 - consider what additional buffers you might need to reshuffle
+							   // the position and velocity data to be coherent within cells.
 
-// LOOK-2.1 - Grid parameters based on simulation parameters.
-// These are automatically computed for you in Boids::initSimulation
+							   // LOOK-2.1 - Grid parameters based on simulation parameters.
+							   // These are automatically computed for you in Boids::initSimulation
 int gridCellCount;
-int gridSideCount;
+int gridResolution;
 float gridCellWidth;
 float gridInverseCellWidth;
 glm::vec3 gridMinimum;
@@ -99,13 +100,13 @@ glm::vec3 gridMinimum;
 ******************/
 
 __host__ __device__ unsigned int hash(unsigned int a) {
-  a = (a + 0x7ed55d16) + (a << 12);
-  a = (a ^ 0xc761c23c) ^ (a >> 19);
-  a = (a + 0x165667b1) + (a << 5);
-  a = (a + 0xd3a2646c) ^ (a << 9);
-  a = (a + 0xfd7046c5) + (a << 3);
-  a = (a ^ 0xb55a4f09) ^ (a >> 16);
-  return a;
+	a = (a + 0x7ed55d16) + (a << 12);
+	a = (a ^ 0xc761c23c) ^ (a >> 19);
+	a = (a + 0x165667b1) + (a << 5);
+	a = (a + 0xd3a2646c) ^ (a << 9);
+	a = (a + 0xfd7046c5) + (a << 3);
+	a = (a ^ 0xb55a4f09) ^ (a >> 16);
+	return a;
 }
 
 /**
@@ -113,10 +114,10 @@ __host__ __device__ unsigned int hash(unsigned int a) {
 * Function for generating a random vec3.
 */
 __host__ __device__ glm::vec3 generateRandomVec3(float time, int index) {
-  thrust::default_random_engine rng(hash((int)(index * time)));
-  thrust::uniform_real_distribution<float> unitDistrib(-1, 1);
+	thrust::default_random_engine rng(hash((int)(index * time)));
+	thrust::uniform_real_distribution<float> unitDistrib(-1, 1);
 
-  return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng));
+	return glm::vec3((float)unitDistrib(rng), (float)unitDistrib(rng), (float)unitDistrib(rng));
 }
 
 /**
@@ -124,52 +125,66 @@ __host__ __device__ glm::vec3 generateRandomVec3(float time, int index) {
 * CUDA kernel for generating boids with a specified mass randomly around the star.
 */
 __global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, float scale) {
-  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if (index < N) {
-    glm::vec3 rand = generateRandomVec3(time, index);
-    arr[index].x = scale * rand.x;
-    arr[index].y = scale * rand.y;
-    arr[index].z = scale * rand.z;
-  }
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index < N) {
+		glm::vec3 rand = generateRandomVec3(time, index);
+		arr[index].x = scale * rand.x;
+		arr[index].y = scale * rand.y;
+		arr[index].z = scale * rand.z;
+	}
 }
 
 /**
 * Initialize memory, update some globals
 */
 void Boids::initSimulation(int N) {
-  numObjects = N;
-  dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
-
-  // LOOK-1.2 - This is basic CUDA memory management and error checking.
-  // Don't forget to cudaFree in  Boids::endSimulation.
-  cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3));
-  checkCUDAErrorWithLine("cudaMalloc dev_pos failed!");
-
-  cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3));
-  checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!");
-
-  cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3));
-  checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!");
-
-  // LOOK-1.2 - This is a typical CUDA kernel invocation.
-  kernGenerateRandomPosArray<<<fullBlocksPerGrid, blockSize>>>(1, numObjects,
-    dev_pos, scene_scale);
-  checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
-
-  // LOOK-2.1 computing grid params
-  gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
-  int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
-  gridSideCount = 2 * halfSideCount;
-
-  gridCellCount = gridSideCount * gridSideCount * gridSideCount;
-  gridInverseCellWidth = 1.0f / gridCellWidth;
-  float halfGridWidth = gridCellWidth * halfSideCount;
-  gridMinimum.x -= halfGridWidth;
-  gridMinimum.y -= halfGridWidth;
-  gridMinimum.z -= halfGridWidth;
-
-  // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
-  cudaThreadSynchronize();
+	numObjects = N;
+	dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
+
+	// LOOK-1.2 - This is basic CUDA memory management and error checking.
+	// Don't forget to cudaFree in  Boids::endSimulation.
+	cudaMalloc((void**)&dev_pos1, N * sizeof(glm::vec3));
+	checkCUDAErrorWithLine("cudaMalloc dev_pos1 failed!");
+
+	cudaMalloc((void**)&dev_pos2, N * sizeof(glm::vec3));
+	checkCUDAErrorWithLine("cudaMalloc dev_pos2 failed!");
+	
+	cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3));
+	checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!");
+
+	cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3));
+	checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!");
+
+	// LOOK-1.2 - This is a typical CUDA kernel invocation.
+	kernGenerateRandomPosArray << <fullBlocksPerGrid, blockSize >> >(1, numObjects, dev_pos1, scene_scale);
+	checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!");
+
+	// LOOK-2.1 computing grid params
+	gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance);
+	int halfSideCount = (int)(scene_scale / gridCellWidth) + 1;
+	gridResolution = 2 * halfSideCount;
+
+	gridCellCount = gridResolution * gridResolution * gridResolution;
+	gridInverseCellWidth = 1.0f / gridCellWidth;
+	float halfGridWidth = gridCellWidth * halfSideCount;
+	gridMinimum.x -= halfGridWidth;
+	gridMinimum.y -= halfGridWidth;
+	gridMinimum.z -= halfGridWidth;
+
+	// TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+	cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+	checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+	cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+	checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+	cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+	checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+	cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+	checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+	cudaThreadSynchronize();
 }
 
 
@@ -181,41 +196,41 @@ void Boids::initSimulation(int N) {
 * Copy the boid positions into the VBO so that they can be drawn by OpenGL.
 */
 __global__ void kernCopyPositionsToVBO(int N, glm::vec3 *pos, float *vbo, float s_scale) {
-  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+	int index = threadIdx.x + (blockIdx.x * blockDim.x);
 
-  float c_scale = -1.0f / s_scale;
+	float c_scale = -1.0f / s_scale;
 
-  if (index < N) {
-    vbo[4 * index + 0] = pos[index].x * c_scale;
-    vbo[4 * index + 1] = pos[index].y * c_scale;
-    vbo[4 * index + 2] = pos[index].z * c_scale;
-    vbo[4 * index + 3] = 1.0f;
-  }
+	if (index < N) {
+		vbo[4 * index + 0] = pos[index].x * c_scale;
+		vbo[4 * index + 1] = pos[index].y * c_scale;
+		vbo[4 * index + 2] = pos[index].z * c_scale;
+		vbo[4 * index + 3] = 1.0f;
+	}
 }
 
 __global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float s_scale) {
-  int index = threadIdx.x + (blockIdx.x * blockDim.x);
-
-  if (index < N) {
-    vbo[4 * index + 0] = vel[index].x + 0.3f;
-    vbo[4 * index + 1] = vel[index].y + 0.3f;
-    vbo[4 * index + 2] = vel[index].z + 0.3f;
-    vbo[4 * index + 3] = 1.0f;
-  }
+	int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+	if (index < N) {
+		vbo[4 * index + 0] = vel[index].x + 0.3f;
+		vbo[4 * index + 1] = vel[index].y + 0.3f;
+		vbo[4 * index + 2] = vel[index].z + 0.3f;
+		vbo[4 * index + 3] = 1.0f;
+	}
 }
 
 /**
 * Wrapper for call to the kernCopyboidsToVBO CUDA kernel.
 */
 void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) {
-  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+	dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
 
-  kernCopyPositionsToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_pos, vbodptr_positions, scene_scale);
-  kernCopyVelocitiesToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_vel1, vbodptr_velocities, scene_scale);
+	kernCopyPositionsToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_pos1, vbodptr_positions, scene_scale);
+	kernCopyVelocitiesToVBO << <fullBlocksPerGrid, blockSize >> >(numObjects, dev_vel1, vbodptr_velocities, scene_scale);
 
-  checkCUDAErrorWithLine("copyBoidsToVBO failed!");
+	checkCUDAErrorWithLine("copyBoidsToVBO failed!");
 
-  cudaThreadSynchronize();
+	cudaThreadSynchronize();
 }
 
 
@@ -230,21 +245,54 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
 * in the `pos` and `vel` arrays.
 */
 __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
-  // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
-  // Rule 2: boids try to stay a distance d away from each other
-  // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+
+	float count1 = 0.f; float count3 = 0.f;
+	glm::vec3 spos = pos[iSelf];
+	glm::vec3 v1(0.f), v2(0.f), v3(0.f);
+
+	for (int i = 0; i < N; ++i) {
+		// Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+		if (i != iSelf) {
+			glm::vec3 bpos = pos[i];
+			float dist = glm::distance(spos, bpos);
+			if (dist < rule1Distance) {
+				v1 += bpos;
+				count1++;
+			}
+			// Rule 2: boids try to stay a distance d away from each other
+			if (dist < rule2Distance) {
+				v2 -= (bpos - spos);
+			}
+			// Rule 3: boids try to match the speed of surrounding boids
+			if (dist < rule3Distance) {
+				v3 += vel[i];
+				count3++;
+			}
+		}
+	}
+
+	v1 = (count1 > 0) ? v1 / count1 - spos : glm::vec3(0.f);
+	v3 = (count3 > 0) ? v3 / count3 : glm::vec3(0.f);
+	return v1*rule1Scale + v2*rule2Scale + v3*rule3Scale;
 }
 
 /**
 * TODO-1.2 implement basic flocking
 * For each of the `N` bodies, update its position based on its current velocity.
 */
-__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
-  glm::vec3 *vel1, glm::vec3 *vel2) {
-  // Compute a new velocity based on pos and vel1
-  // Clamp the speed
-  // Record the new velocity into vel2. Question: why NOT vel1?
+__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2)
+{
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index >= N) return;
+
+	// Compute a new velocity based on pos and vel1
+	glm::vec3 vel = vel1[index] + computeVelocityChange(N, index, pos, vel1);
+
+	// Clamp the speed
+	vel = (glm::length(vel) > maxSpeed) ? glm::normalize(vel) * maxSpeed : vel;
+
+	// Record the new velocity into vel2. Question: why NOT vel1? read / write
+	vel2[index] = vel;
 }
 
 /**
@@ -252,208 +300,472 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
 * For each of the `N` bodies, update its position based on its current velocity.
 */
 __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
-  // Update position by velocity
-  int index = threadIdx.x + (blockIdx.x * blockDim.x);
-  if (index >= N) {
-    return;
-  }
-  glm::vec3 thisPos = pos[index];
-  thisPos += vel[index] * dt;
-
-  // Wrap the boids around so we don't lose them
-  thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x;
-  thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y;
-  thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z;
-
-  thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x;
-  thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y;
-  thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z;
-
-  pos[index] = thisPos;
+	// Update position by velocity
+	int index = threadIdx.x + (blockIdx.x * blockDim.x);
+	if (index >= N) {
+		return;
+	}
+	glm::vec3 thisPos = pos[index];
+	thisPos += vel[index] * dt;
+
+	// Wrap the boids around so we don't lose them
+	thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x;
+	thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y;
+	thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z;
+
+	thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x;
+	thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y;
+	thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z;
+
+	pos[index] = thisPos;
+}
+
+__global__ void kernUpdateCoherentPos(int N, float dt, glm::vec3 *in, glm::vec3 *out, glm::vec3 *vel) {
+	// Update position by velocity
+	int index = threadIdx.x + (blockIdx.x * blockDim.x);
+	if (index >= N) {
+		return;
+	}
+
+	glm::vec3 thisPos = in[index];
+	thisPos += vel[index] * dt;
+
+	// Wrap the boids around so we don't lose them
+	thisPos.x = thisPos.x < -scene_scale ? scene_scale : thisPos.x;
+	thisPos.y = thisPos.y < -scene_scale ? scene_scale : thisPos.y;
+	thisPos.z = thisPos.z < -scene_scale ? scene_scale : thisPos.z;
+
+	thisPos.x = thisPos.x > scene_scale ? -scene_scale : thisPos.x;
+	thisPos.y = thisPos.y > scene_scale ? -scene_scale : thisPos.y;
+	thisPos.z = thisPos.z > scene_scale ? -scene_scale : thisPos.z;
+
+	out[index] = thisPos;
+	
 }
 
 // LOOK-2.1 Consider this method of computing a 1D index from a 3D grid index.
 // LOOK-2.3 Looking at this method, what would be the most memory efficient
 //          order for iterating over neighboring grid cells?
-//          for(x)
+//          for(z)
 //            for(y)
-//             for(z)? Or some other order?
+//              for(x)
 __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) {
-  return x + y * gridResolution + z * gridResolution * gridResolution;
+	return x + y * gridResolution + z * gridResolution * gridResolution;
 }
 
-__global__ void kernComputeIndices(int N, int gridResolution,
-  glm::vec3 gridMin, float inverseCellWidth,
-  glm::vec3 *pos, int *indices, int *gridIndices) {
-    // TODO-2.1
-    // - Label each boid with the index of its grid cell.
-    // - Set up a parallel array of integer indices as pointers to the actual
-    //   boid data in pos and vel1/vel2
+__global__ void kernComputeIndices(int N, int gridRes,
+	glm::vec3 gridMin, float inverseCellWidth,
+	glm::vec3 *pos, int *indices, int *gridIndices)
+{
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index >= N) return;
+	//printf("%f, %f, %f \n", pos[index].x, pos[index].y, pos[index].z);
+
+	// TODO-2.1
+	// - Label each boid with the index of its grid cell (dev_particleGridIndices)
+	glm::ivec3 boid_pos = (pos[index] - gridMin) * inverseCellWidth;
+	gridIndices[index] = gridIndex3Dto1D(boid_pos.x, boid_pos.y, boid_pos.z, gridRes);
+	// - Set up a parallel array of integer indices as pointers to the actual
+	//   boid data in pos and vel1/vel2 (dev_particleArrayIndices)
+	indices[index] = index;
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
 //          does not enclose any boids
 __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) {
-  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
-  if (index < N) {
-    intBuffer[index] = value;
-  }
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index < N) {
+		intBuffer[index] = value;
+	}
 }
 
 __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
-  int *gridCellStartIndices, int *gridCellEndIndices) {
-  // TODO-2.1
-  // Identify the start point of each cell in the gridIndices array.
-  // This is basically a parallel unrolling of a loop that goes
-  // "this index doesn't match the one before it, must be a new cell!"
+	int *gridCellStartIndices, int *gridCellEndIndices) {
+	// TODO-2.1
+	// Identify the start and end point of each cell in the gridIndices array.
+	// This is basically a parallel unrolling of a loop that goes
+	// "this index doesn't match the one before it, must be a new cell!"
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index >= N) return;
+
+	int curr = particleGridIndices[index];
+
+	if (index == 0) {
+		gridCellStartIndices[curr] = index;
+		return;
+	}
+
+	int prev = particleGridIndices[index - 1];
+
+	if (prev != curr) {
+		gridCellStartIndices[curr] = index;
+		gridCellEndIndices[prev] = index; // not inclusive
+	}
+	if (index == N - 1) {
+		gridCellEndIndices[curr] = index + 1; // not inclusive
+		return;
+	}
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
-  int N, int gridResolution, glm::vec3 gridMin,
-  float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  int *particleArrayIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
-  // the number of boids that need to be checked.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+	int N, int gridRes, glm::vec3 gridMin,
+	float inverseCellWidth, float cellWidth,
+	int *gridCellStartIndices, int *gridCellEndIndices,
+	int *particleArrayIndices,
+	glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2)
+{
+	// TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
+	// the number of boids that need to be checked.
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index >= N) return;
+
+	// - Identify the grid cell that this particle is in
+	int partIdx = particleArrayIndices[index];
+	glm::vec3 ppos = pos[partIdx];
+	glm::ivec3 partIdx3D = (ppos - gridMin) * inverseCellWidth;
+
+	// - Identify which cells may contain neighbors. This isn't always 8.
+	int neighbors[8];
+	//int mask[8];
+	int numNeighbors = 0;
+
+	int nextZ = (ppos.z - cellWidth * partIdx3D.z - gridMin.z < cellWidth/2) ? -1 : 1;
+	for (int k = 0; k < 2; k ++) {
+		int z = partIdx3D.z + nextZ*k;
+		if (z < 0 && z >= gridRes) break;
+		int nextY = (ppos.y - cellWidth * partIdx3D.y - gridMin.y < cellWidth / 2) ? -1 : 1;
+		for (int j = 0; j < 2; j ++) {
+			int y = partIdx3D.y + nextY*j;
+			if (y < 0 && y >= gridRes) break;
+			int nextX = (ppos.x - cellWidth * partIdx3D.x - gridMin.x < cellWidth / 2) ? -1 : 1;
+			for (int i = 0; i < 2; i ++) {
+				int x = partIdx3D.x + nextX*i;
+				if (x < 0 && x >= gridRes) break;
+				neighbors[numNeighbors] = gridIndex3Dto1D(x, y, z, gridRes);
+				numNeighbors++;
+			}
+		}
+	}
+
+	float count1 = 0.f; float count3 = 0.f;
+	glm::vec3 v1(0.f), v2(0.f), v3(0.f);
+
+	for (int i = 0; i < numNeighbors; ++i) {
+		// - For each cell, read the start/end indices in the boid pointer array.
+		int startIdx = gridCellStartIndices[neighbors[i]];
+		int endIdx = gridCellEndIndices[neighbors[i]];
+
+		// - Access each boid in the cell and compute velocity change from
+		//   the boids rules, if this boid is within the neighborhood distance.
+		for (int j = startIdx; j < endIdx; ++j) {
+			if (j != index) {
+				int boidIdx = particleArrayIndices[j];
+				glm::vec3 bpos = pos[boidIdx];
+				float dist = glm::distance(ppos, bpos);
+				// Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+				if (dist < rule1Distance) {
+					v1 += bpos;
+					count1++;
+				}
+				// Rule 2: boids try to stay a distance d away from each other
+				if (dist < rule2Distance) {
+					v2 -= (bpos - ppos);
+				}
+				// Rule 3: boids try to match the speed of surrounding boids
+				if (dist < rule3Distance) {
+					v3 += vel1[boidIdx];
+					count3++;
+				}
+			}
+		}
+	}
+
+	v1 = (count1 > 0) ? v1 / count1 - ppos : glm::vec3(0.f);
+	v3 = (count3 > 0) ? v3 / count3 : glm::vec3(0.f);
+	glm::vec3 vel = vel1[partIdx] + v1*rule1Scale + v2*rule2Scale + v3*rule3Scale;
+
+	// Clamp the speed
+	vel = (glm::length(vel) > maxSpeed) ? glm::normalize(vel) * maxSpeed : vel;
+
+	// Record the new velocity into vel2.
+	vel2[partIdx] = vel;
+}
+
+__global__ void kernShuffleV(int N, int *particleArrayIndices, glm::vec3 *new_arr, glm::vec3 *old_arr) {
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index >= N) return;
+
+	new_arr[index] = old_arr[particleArrayIndices[index]];
+}
+
+__global__ void kernShuffleP(int N, int *particleArrayIndices, glm::vec3 *new_arr, glm::vec3 *old_arr) {
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index >= N) return;
+
+	new_arr[index] = old_arr[particleArrayIndices[index]];
+	//printf("%d\n", particleArrayIndices[index]);
+
+	//printf("%f, %f, %f \n", old_arr[index].x, old_arr[index].y, old_arr[index].z);
 }
 
 __global__ void kernUpdateVelNeighborSearchCoherent(
-  int N, int gridResolution, glm::vec3 gridMin,
-  float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
-  // except with one less level of indirection.
-  // This should expect gridCellStartIndices and gridCellEndIndices to refer
-  // directly to pos and vel1.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  //   DIFFERENCE: For best results, consider what order the cells should be
-  //   checked in to maximize the memory benefits of reordering the boids data.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+	int N, int gridRes, glm::vec3 gridMin,
+	float inverseCellWidth, float cellWidth,
+	int *gridCellStartIndices, int *gridCellEndIndices,
+	glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) 
+{
+	// TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
+	// except with one less level of indirection.
+	// This should expect gridCellStartIndices and gridCellEndIndices to refer
+	// directly to pos and vel1.
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index >= N) return;
+
+	// - Identify the grid cell that this particle is in
+	glm::vec3 ppos = pos[index];
+	glm::ivec3 partIdx3D = (ppos - gridMin) * inverseCellWidth;
+
+	// - Identify which cells may contain neighbors. This isn't always 8.
+	int neighbors[8];
+	int numNeighbors = 0;
+
+	int nextZ = (ppos.z - cellWidth * partIdx3D.z - gridMin.z < cellWidth/2) ? -1 : 1;
+	for (int k = 0; k < 2; k++) {
+		int z = partIdx3D.z + nextZ*k;
+		if (z < 0 && z >= gridRes) break;
+		int nextY = (ppos.y - cellWidth * partIdx3D.y - gridMin.y < cellWidth/ 2) ? -1 : 1;
+		for (int j = 0; j < 2; j++) {
+			int y = partIdx3D.y + nextY*j;
+			if (y < 0 && y >= gridRes) break;
+			int nextX = (ppos.x - cellWidth * partIdx3D.x - gridMin.x < cellWidth / 2) ? -1 : 1;
+			for (int i = 0; i < 2; i++) {
+				int x = partIdx3D.x + nextX*i;
+				if (x < 0 && x >= gridRes) break;
+				neighbors[numNeighbors] = gridIndex3Dto1D(x, y, z, gridRes);
+				numNeighbors++;
+			}
+		}
+	}
+
+	float count1 = 0.f; float count3 = 0.f;
+	glm::vec3 v1(0.f), v2(0.f), v3(0.f);
+
+	for (int i = 0; i < numNeighbors; ++i) {
+		// - For each cell, read the start/end indices in the boid pointer array.
+		int startIdx = gridCellStartIndices[neighbors[i]];
+		int endIdx = gridCellEndIndices[neighbors[i]];
+
+		//   DIFFERENCE: For best results, consider what order the cells should be
+		//   checked in to maximize the memory benefits of reordering the boids data.
+
+		// - Access each boid in the cell and compute velocity change from
+		//   the boids rules, if this boid is within the neighborhood distance.
+		for (int boidIdx = startIdx; boidIdx < endIdx; ++boidIdx) {
+			if (boidIdx != index) {
+				glm::vec3 bpos = pos[boidIdx];
+				float dist = glm::distance(ppos, bpos);
+				// Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+				if (dist < rule1Distance) {
+					v1 += bpos;
+					count1++;
+				}
+				// Rule 2: boids try to stay a distance d away from each other
+				if (dist < rule2Distance) {
+					v2 -= (bpos - ppos);
+				}
+				// Rule 3: boids try to match the speed of surrounding boids
+				if (dist < rule3Distance) {
+					v3 += vel1[boidIdx];
+					count3++;
+				}
+			}
+		}
+	}
+
+	v1 = (count1 > 0) ? v1 / count1 - ppos : glm::vec3(0.f);
+	v3 = (count3 > 0) ? v3 / count3 : glm::vec3(0.f);
+	glm::vec3 vel = vel1[index] + v1*rule1Scale + v2*rule2Scale + v3*rule3Scale;
+
+	// Clamp the speed
+	vel = (glm::length(vel) > maxSpeed) ? glm::normalize(vel) * maxSpeed : vel;
+
+	// Record the new velocity into vel2.
+	vel2[index] = vel;
 }
 
 /**
 * Step the entire N-body simulation by `dt` seconds.
 */
 void Boids::stepSimulationNaive(float dt) {
-  // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
-  // TODO-1.2 ping-pong the velocity buffers
+	dim3 blocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+	// TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
+	kernUpdateVelocityBruteForce << <blocksPerGrid, blockSize >> >(numObjects, dev_pos1, dev_vel1, dev_vel2);
+	checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!");
+	kernUpdatePos << <blocksPerGrid, blockSize >> >(numObjects, dt, dev_pos1, dev_vel2);
+	checkCUDAErrorWithLine("kernUpdatePos failed!");
+
+	// TODO-1.2 ping-pong the velocity buffers (read/write)
+	std::swap(dev_vel1, dev_vel2);
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
-  // TODO-2.1
-  // Uniform Grid Neighbor search using Thrust sort.
-  // In Parallel:
-  // - label each particle with its array index as well as its grid index.
-  //   Use 2x width grids.
-  // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
-  //   are welcome to do a performance comparison.
-  // - Naively unroll the loop for finding the start and end indices of each
-  //   cell's data pointers in the array of boid indices
-  // - Perform velocity updates using neighbor search
-  // - Update positions
-  // - Ping-pong buffers as needed
+	// TODO-2.1
+	// Uniform Grid Neighbor search using Thrust sort.
+	dim3 blocksPerGrid((numObjects + blockSize - 1) / blockSize);
+	dim3 blocksPerGridCell((gridCellCount + blockSize - 1) / blockSize);
+	// In Parallel:
+	// - label each particle with its array index as well as its grid index.
+	//   Use 2x width grids.
+	kernComputeIndices << <blocksPerGrid, blockSize >> >(numObjects, gridResolution, gridMinimum, gridInverseCellWidth, dev_pos1, dev_particleArrayIndices, dev_particleGridIndices);
+	checkCUDAErrorWithLine("kernComputeIndices failed!");
+
+	// - Unstable key sort using Thrust. A stable sort isn't necessary, but you
+	//   are welcome to do a performance comparison.
+	thrust::device_ptr<int> dev_thrust_grid(dev_particleGridIndices);
+	thrust::device_ptr<int> dev_thrust_particles(dev_particleArrayIndices);
+	thrust::sort_by_key(dev_thrust_grid, dev_thrust_grid + numObjects, dev_thrust_particles);
+
+	// - Naively unroll the loop for finding the start and end indices of each
+	//   cell's data pointers in the array of boid indices
+	kernResetIntBuffer << <blocksPerGridCell, blockSize >> >(gridCellCount, dev_gridCellStartIndices, -1);
+	kernResetIntBuffer << <blocksPerGridCell, blockSize >> >(gridCellCount, dev_gridCellEndIndices, -1);
+	checkCUDAErrorWithLine("kernResetIntBuffer failed!");
+	kernIdentifyCellStartEnd << <blocksPerGrid, blockSize >> >(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+	checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
+
+	// - Perform velocity updates using neighbor search
+	kernUpdateVelNeighborSearchScattered << <blocksPerGrid, blockSize >> >(numObjects, gridResolution, gridMinimum, gridInverseCellWidth, gridCellWidth,
+		dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos1, dev_vel1, dev_vel2);
+	checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!");
+
+	// - Update positions
+	kernUpdatePos << <blocksPerGrid, blockSize >> >(numObjects, dt, dev_pos1, dev_vel2);
+	checkCUDAErrorWithLine("kernUpdatePos failed!");
+
+	// - Ping-pong buffers as needed
+	std::swap(dev_vel1, dev_vel2);
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
-  // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
-  // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
-  // In Parallel:
-  // - Label each particle with its array index as well as its grid index.
-  //   Use 2x width grids
-  // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
-  //   are welcome to do a performance comparison.
-  // - Naively unroll the loop for finding the start and end indices of each
-  //   cell's data pointers in the array of boid indices
-  // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
-  //   the particle data in the simulation array.
-  //   CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED
-  // - Perform velocity updates using neighbor search
-  // - Update positions
-  // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+	// TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
+	// Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
+	dim3 blocksPerGrid((numObjects + blockSize - 1) / blockSize);
+	dim3 blocksPerGridCell((gridCellCount + blockSize - 1) / blockSize);
+
+	// In Parallel:
+	// - label each particle with its array index as well as its grid index.
+	//   Use 2x width grids.
+	kernComputeIndices << <blocksPerGrid, blockSize >> >(numObjects, gridResolution, gridMinimum, gridInverseCellWidth, dev_pos1, dev_particleArrayIndices, dev_particleGridIndices);
+	checkCUDAErrorWithLine("kernComputeIndices failed!");
+
+	// - Unstable key sort using Thrust. A stable sort isn't necessary, but you
+	//   are welcome to do a performance comparison.
+	thrust::device_ptr<int> dev_thrust_grid(dev_particleGridIndices);
+	thrust::device_ptr<int> dev_thrust_particles(dev_particleArrayIndices);
+	thrust::sort_by_key(dev_thrust_grid, dev_thrust_grid + numObjects, dev_thrust_particles);
+
+	// - Naively unroll the loop for finding the start and end indices of each
+	//   cell's data pointers in the array of boid indices
+	kernResetIntBuffer << <blocksPerGridCell, blockSize >> >(gridCellCount, dev_gridCellStartIndices, -1);
+	kernResetIntBuffer << <blocksPerGridCell, blockSize >> >(gridCellCount, dev_gridCellEndIndices, -1);
+	checkCUDAErrorWithLine("kernResetIntBuffer failed!");
+	kernIdentifyCellStartEnd << <blocksPerGrid, blockSize >> >(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+	checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
+
+	//// - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
+	////   the particle data in the simulation array.
+	////   CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED. 
+	kernShuffleV << <blocksPerGrid, blockSize >> >(numObjects, dev_particleArrayIndices, dev_vel2, dev_vel1); // out, in
+	kernShuffleP << <blocksPerGrid, blockSize >> >(numObjects, dev_particleArrayIndices, dev_pos2, dev_pos1);
+	checkCUDAErrorWithLine("kernShuffle failed!");
+
+	// - Perform velocity updates using neighbor search
+	kernUpdateVelNeighborSearchCoherent << <blocksPerGrid, blockSize >> >(numObjects, gridResolution, gridMinimum, gridInverseCellWidth, gridCellWidth,
+		dev_gridCellStartIndices, dev_gridCellEndIndices, dev_pos2, dev_vel2, dev_vel1);
+	checkCUDAErrorWithLine("kernUpdateVelNeighborSearchCoherent failed!");	
+
+	// - Update positions into pos1
+	kernUpdateCoherentPos << <blocksPerGrid, blockSize >> >(numObjects, dt, dev_pos2, dev_pos1, dev_vel1); // in, out
+	checkCUDAErrorWithLine("kernUpdatePos failed!");
+
+	// - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
 }
 
 void Boids::endSimulation() {
-  cudaFree(dev_vel1);
-  cudaFree(dev_vel2);
-  cudaFree(dev_pos);
-
-  // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+	cudaFree(dev_vel1);
+	cudaFree(dev_vel2);
+	cudaFree(dev_pos1);
+
+	// TODO-2.1 TODO-2.3 - Free any additional buffers here.
+	cudaFree(dev_gridCellStartIndices);
+	cudaFree(dev_gridCellEndIndices);
+	cudaFree(dev_particleArrayIndices);
+	cudaFree(dev_particleGridIndices);
 }
 
 void Boids::unitTest() {
-  // LOOK-1.2 Feel free to write additional tests here.
-
-  // test unstable sort
-  int *dev_intKeys;
-  int *dev_intValues;
-  int N = 10;
-
-  int *intKeys = new int[N];
-  int *intValues = new int[N];
-
-  intKeys[0] = 0; intValues[0] = 0;
-  intKeys[1] = 1; intValues[1] = 1;
-  intKeys[2] = 0; intValues[2] = 2;
-  intKeys[3] = 3; intValues[3] = 3;
-  intKeys[4] = 0; intValues[4] = 4;
-  intKeys[5] = 2; intValues[5] = 5;
-  intKeys[6] = 2; intValues[6] = 6;
-  intKeys[7] = 0; intValues[7] = 7;
-  intKeys[8] = 5; intValues[8] = 8;
-  intKeys[9] = 6; intValues[9] = 9;
-
-  cudaMalloc((void**)&dev_intKeys, N * sizeof(int));
-  checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!");
-
-  cudaMalloc((void**)&dev_intValues, N * sizeof(int));
-  checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
-
-  dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
-
-  std::cout << "before unstable sort: " << std::endl;
-  for (int i = 0; i < N; i++) {
-    std::cout << "  key: " << intKeys[i];
-    std::cout << " value: " << intValues[i] << std::endl;
-  }
-
-  // How to copy data to the GPU
-  cudaMemcpy(dev_intKeys, intKeys, sizeof(int) * N, cudaMemcpyHostToDevice);
-  cudaMemcpy(dev_intValues, intValues, sizeof(int) * N, cudaMemcpyHostToDevice);
-
-  // Wrap device vectors in thrust iterators for use with thrust.
-  thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
-  thrust::device_ptr<int> dev_thrust_values(dev_intValues);
-  // LOOK-2.1 Example for using thrust::sort_by_key
-  thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
-
-  // How to copy data back to the CPU side from the GPU
-  cudaMemcpy(intKeys, dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
-  cudaMemcpy(intValues, dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
-  checkCUDAErrorWithLine("memcpy back failed!");
-
-  std::cout << "after unstable sort: " << std::endl;
-  for (int i = 0; i < N; i++) {
-    std::cout << "  key: " << intKeys[i];
-    std::cout << " value: " << intValues[i] << std::endl;
-  }
-
-  // cleanup
-  delete[] intKeys;
-  delete[] intValues;
-  cudaFree(dev_intKeys);
-  cudaFree(dev_intValues);
-  checkCUDAErrorWithLine("cudaFree failed!");
-  return;
-}
+	// LOOK-1.2 Feel free to write additional tests here.
+
+	// test unstable sort
+	int *dev_intKeys;
+	int *dev_intValues;
+	int N = 10;
+
+	int *intKeys = new int[N];
+	int *intValues = new int[N];
+
+	intKeys[0] = 0; intValues[0] = 0;
+	intKeys[1] = 1; intValues[1] = 1;
+	intKeys[2] = 0; intValues[2] = 2;
+	intKeys[3] = 3; intValues[3] = 3;
+	intKeys[4] = 0; intValues[4] = 4;
+	intKeys[5] = 2; intValues[5] = 5;
+	intKeys[6] = 2; intValues[6] = 6;
+	intKeys[7] = 0; intValues[7] = 7;
+	intKeys[8] = 5; intValues[8] = 8;
+	intKeys[9] = 6; intValues[9] = 9;
+
+	cudaMalloc((void**)&dev_intKeys, N * sizeof(int));
+	checkCUDAErrorWithLine("cudaMalloc dev_intKeys failed!");
+
+	cudaMalloc((void**)&dev_intValues, N * sizeof(int));
+	checkCUDAErrorWithLine("cudaMalloc dev_intValues failed!");
+
+	dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
+
+	std::cout << "before unstable sort: " << std::endl;
+	for (int i = 0; i < N; i++) {
+		std::cout << "  key: " << intKeys[i];
+		std::cout << " value: " << intValues[i] << std::endl;
+	}
+
+	// How to copy data to the GPU
+	cudaMemcpy(dev_intKeys, intKeys, sizeof(int) * N, cudaMemcpyHostToDevice);
+	cudaMemcpy(dev_intValues, intValues, sizeof(int) * N, cudaMemcpyHostToDevice);
+
+	// Wrap device vectors in thrust iterators for use with thrust.
+	thrust::device_ptr<int> dev_thrust_keys(dev_intKeys);
+	thrust::device_ptr<int> dev_thrust_values(dev_intValues);
+	// LOOK-2.1 Example for using thrust::sort_by_key
+	thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + N, dev_thrust_values);
+
+	// How to copy data back to the CPU side from the GPU
+	cudaMemcpy(intKeys, dev_intKeys, sizeof(int) * N, cudaMemcpyDeviceToHost);
+	cudaMemcpy(intValues, dev_intValues, sizeof(int) * N, cudaMemcpyDeviceToHost);
+	checkCUDAErrorWithLine("memcpy back failed!");
+
+	std::cout << "after unstable sort: " << std::endl;
+	for (int i = 0; i < N; i++) {
+		std::cout << "  key: " << intKeys[i];
+		std::cout << " value: " << intValues[i] << std::endl;
+	}
+
+	// cleanup
+	delete[] intKeys;
+	delete[] intValues;
+	cudaFree(dev_intKeys);
+	cudaFree(dev_intValues);
+	checkCUDAErrorWithLine("cudaFree failed!");
+	return;
+}
\ No newline at end of file
diff --git a/src/main.cpp b/src/main.cpp
index a29471d..93e9c75 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,11 +14,11 @@
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
 #define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
+const int N_FOR_VIS = 1000000;
 const float DT = 0.2f;
 
 /**
@@ -93,6 +93,7 @@ bool init(int argc, char **argv) {
   glfwSetCursorPosCallback(window, mousePositionCallback);
   glfwSetMouseButtonCallback(window, mouseButtonCallback);
 
+  glfwSwapInterval(false);
   glewExperimental = GL_TRUE;
   if (glewInit() != GLEW_OK) {
     return false;
diff --git a/typecomparison.png b/typecomparison.png
new file mode 100644
index 0000000..03c1486
Binary files /dev/null and b/typecomparison.png differ
diff --git a/visualization.png b/visualization.png
new file mode 100644
index 0000000..6a1e644
Binary files /dev/null and b/visualization.png differ