diff --git a/README.md b/README.md
index 98dd9a8..a366d56 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,39 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture,
 Project 1 - Flocking**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Ricky Rajani
+* Tested on: Windows 7, i7-6700 @ 3.40GHz 16GB, NVIDIA Quadro K620 (Moore 100C Lab)
 
-### (TODO: Your README)
+**5,000 Boids on Coherent Uniform Grid**
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+![](images/perf-analysis/Shot1.PNG)
+
+**50,000 Boids on Coherent Uniform Grid**
+
+![](images/perf-analysis/Shot2.PNG)
+
+**100,000 Boids on Coherent Uniform Grid**
+
+![](images/perf-analysis/Shot3.PNG)
+
+### Performance Analysis
+
+![](images/perf-analysis/Graph1.PNG)
+
+![](images/perf-analysis/Graph2.PNG)
+
+For each implementation, how does changing the number of boids affect performance?
+
+As the number of boids increased, performance dropped significantly for every implementation. However, the scattered uniform grid and coherent uniform grid methods performed considerably better than the naive implementation. The improvement can be attributed to the runtime complexity of the neighbor search in each implementation: the naive method compares every boid against every other boid, whereas the uniform grid reduces the number of boids checked during each iteration to those in nearby cells. On top of that, the coherent layout made memory access significantly faster.
+
+For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid?
+
+Rearranging the boid data so that the positions and velocities of boids in the same cell are contiguous in memory gave a considerable performance improvement. This layout allows direct access to the boid data, so the GPU no longer has to jump around in memory by following an extra level of index pointers.
+
+![](images/perf-analysis/Graph3.PNG)
+
+![](images/perf-analysis/Graph4.PNG)
+
+For each implementation, how does changing the block count and block size affect performance?
+
+All implementations show a significant performance hit at small block sizes, specifically 32, and a slight decrease in performance as the block size increases past 128 towards the maximum value of 1024. With a block size of 32, each block contains only a single warp (32 threads). This leads to a decrease in shared memory and a need for more blocks. For large block sizes, the number of blocks we can have must decrease because there is a cap on the number of threads the GPU can handle.
diff --git a/images/perf-analysis/Graph1.PNG b/images/perf-analysis/Graph1.PNG
new file mode 100644
index 0000000..a813444
Binary files /dev/null and b/images/perf-analysis/Graph1.PNG differ
diff --git a/images/perf-analysis/Graph2.PNG b/images/perf-analysis/Graph2.PNG
new file mode 100644
index 0000000..4f14a29
Binary files /dev/null and b/images/perf-analysis/Graph2.PNG differ
diff --git a/images/perf-analysis/Graph3.PNG b/images/perf-analysis/Graph3.PNG
new file mode 100644
index 0000000..216f14f
Binary files /dev/null and b/images/perf-analysis/Graph3.PNG differ
diff --git a/images/perf-analysis/Graph4.PNG b/images/perf-analysis/Graph4.PNG
new file mode 100644
index 0000000..3ec1416
Binary files /dev/null and b/images/perf-analysis/Graph4.PNG differ
diff --git a/images/perf-analysis/Shot1.PNG b/images/perf-analysis/Shot1.PNG
new file mode 100644
index 0000000..e07c2e0
Binary files /dev/null and b/images/perf-analysis/Shot1.PNG differ
diff --git a/images/perf-analysis/Shot2.PNG b/images/perf-analysis/Shot2.PNG
new file mode 100644
index 0000000..c5b774e
Binary files /dev/null and b/images/perf-analysis/Shot2.PNG differ
diff --git a/images/perf-analysis/Shot3.PNG b/images/perf-analysis/Shot3.PNG
new file mode 100644
index 0000000..a3f1cab
Binary files /dev/null and b/images/perf-analysis/Shot3.PNG differ
diff --git a/src/kernel.cu b/src/kernel.cu
index aaf0fbf..80d0eaf 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -85,6 +85,9 @@ int *dev_gridCellEndIndices;   // to this cell?
 
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3 *dev_pos_sorted;   // dev_pos gathered into sorted order by kernRearrangeDataPointers
+glm::vec3 *dev_vel1_sorted;  // dev_vel1 gathered into the same sorted order
+glm::vec3 *dev_vel2_sorted;  // velocity output buffer of kernUpdateVelNeighborSearchCoherent
 
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
@@ -169,6 +172,31 @@ void Boids::initSimulation(int N) {
   gridMinimum.z -= halfGridWidth;
 
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+  cudaMalloc((void**)&dev_pos_sorted, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_pos_sorted failed!");
+
+  cudaMalloc((void**)&dev_vel1_sorted, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_vel1_sorted failed!");
+
+  cudaMalloc((void**)&dev_vel2_sorted, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_vel2_sorted failed!");
+
+  dev_thrust_particleArrayIndices=thrust::device_pointer_cast(dev_particleArrayIndices);
+  dev_thrust_particleGridIndices=thrust::device_pointer_cast(dev_particleGridIndices);
+
+
   cudaThreadSynchronize();
 }
 
@@ -230,10 +258,55 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
 * in the `pos` and `vel` arrays.
 */
 __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
-  // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
-  // Rule 2: boids try to stay a distance d away from each other
-  // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+  // TODO-1.2
+  // Snapshot this boid's own state; the loop below only reads the others.
+  const glm::vec3 selfPos = pos[iSelf];
+  const glm::vec3 selfVel = vel[iSelf];
+
+  glm::vec3 centerSum(0.0f);    // rule 1: summed neighbor positions
+  glm::vec3 separation(0.0f);   // rule 2: summed offsets pointing away from neighbors
+  glm::vec3 velocitySum(0.0f);  // rule 3: summed neighbor velocities
+  float cohesionCount = 0.0f;
+  float alignmentCount = 0.0f;
+
+  for (int other = 0; other < N; ++other) {
+    if (other == iSelf) {
+      continue;  // a boid is never its own neighbor
+    }
+
+    const glm::vec3 otherPos = pos[other];
+    const glm::vec3 otherVel = vel[other];
+    const float gap = glm::distance(otherPos, selfPos);
+
+    // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+    if (gap < rule1Distance) {
+      centerSum += otherPos;
+      cohesionCount += 1.0f;
+    }
+
+    // Rule 2: boids try to stay a distance d away from each other
+    if (gap < rule2Distance) {
+      separation -= (otherPos - selfPos);
+    }
+
+    // Rule 3: boids try to match the speed of surrounding boids
+    if (gap < rule3Distance) {
+      velocitySum += otherVel;
+      alignmentCount += 1.0f;
+    }
+  }
+
+  // Average rules 1 and 3 only when at least one neighbor contributed;
+  // otherwise the sums are still zero and contribute nothing.
+  if (cohesionCount > 0) {
+    centerSum = (centerSum / cohesionCount - selfPos) * rule1Scale;
+  }
+  if (alignmentCount > 0) {
+    velocitySum = (velocitySum / alignmentCount) * rule3Scale;
+  }
+  separation *= rule2Scale;
+
+  return selfVel + centerSum + separation + velocitySum;
 }
 
 /**
@@ -242,9 +315,21 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
 */
 __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
   glm::vec3 *vel1, glm::vec3 *vel2) {
+  // One thread per boid; threads past the end of the arrays exit immediately.
+  const int boid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (boid >= N) {
+    return;
+  }
   // Compute a new velocity based on pos and vel1
+  glm::vec3 velocity = computeVelocityChange(N, boid, pos, vel1);
   // Clamp the speed
+  const float speed = glm::length(velocity);
+  if (speed > maxSpeed) {
+    velocity = glm::normalize(velocity) * maxSpeed;
+  }
   // Record the new velocity into vel2. Question: why NOT vel1?
+  // (vel1 is still being read by other threads during this launch.)
+  vel2[boid] = velocity;
 }
 
 /**
@@ -286,9 +371,18 @@ __global__ void kernComputeIndices(int N, int gridResolution,
   glm::vec3 gridMin, float inverseCellWidth,
   glm::vec3 *pos, int *indices, int *gridIndices) {
     // TODO-2.1
-    // - Label each boid with the index of its grid cell.
-    // - Set up a parallel array of integer indices as pointers to the actual
-    //   boid data in pos and vel1/vel2
+  // One thread per boid; threads beyond N have nothing to label.
+  const int boid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (boid >= N) {
+    return;
+  }
+  // - Label each boid with the index of its grid cell.
+  //   The vec3 -> ivec3 conversion yields the integer cell coordinates.
+  const glm::ivec3 cell = (pos[boid] - gridMin) * inverseCellWidth;
+  gridIndices[boid] = gridIndex3Dto1D(cell.x, cell.y, cell.z, gridResolution);
+  // - Set up a parallel array of integer indices as pointers to the actual
+  //   boid data in pos and vel1/vel2
+  indices[boid] = boid;
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -306,6 +400,22 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+  int curr = particleGridIndices[index];
+  if (index == 0) {
+    gridCellStartIndices[curr] = index;  // first run; there is no [index - 1]
+  } else if (particleGridIndices[index - 1] != curr) {
+    // Key changed: close the previous cell's run and open this one.
+    gridCellEndIndices[particleGridIndices[index - 1]] = index - 1;
+    gridCellStartIndices[curr] = index;
+  }
+  // The last boid closes the final run; no later thread would record it.
+  if (index == N - 1) {
+    gridCellEndIndices[curr] = index;
+  }
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
@@ -317,11 +427,102 @@ __global__ void kernUpdateVelNeighborSearchScattered(
   // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
   // the number of boids that need to be checked.
   // - Identify the grid cell that this particle is in
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+  const glm::vec3 selfPos = pos[index];
+  const glm::vec3 gridPos = (selfPos - gridMin) * inverseCellWidth;
   // - Identify which cells may contain neighbors. This isn't always 8.
+  // Shifting by half a cell before flooring picks, per axis, the lower of the
+  // two cells nearest this boid: (c-1, c) when it sits in the lower half of
+  // cell c, (c, c+1) when it sits in the upper half. Always starting at c-1
+  // would miss neighbors just across the upper cell boundary.
+  // NOTE(review): a 2-cell window per axis assumes the cell width is at least
+  // twice the largest rule distance - confirm against initSimulation.
+  const glm::ivec3 base = glm::ivec3(glm::floor(gridPos - 0.5f));
+  const glm::ivec3 lo(imax(0, base.x), imax(0, base.y), imax(0, base.z));
+
+  glm::vec3 perceived_center = glm::vec3(0.0f);
+  glm::vec3 c = glm::vec3(0.0f);
+  glm::vec3 perceived_velocity = glm::vec3(0.0f);
+  float rule1N = 0;
+  float rule3N = 0;
   // - For each cell, read the start/end indices in the boid pointer array.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
+  for (int cx = lo.x; cx <= base.x + 1 && cx < gridResolution; cx++) {
+    for (int cy = lo.y; cy <= base.y + 1 && cy < gridResolution; cy++) {
+      for (int cz = lo.z; cz <= base.z + 1 && cz < gridResolution; cz++) {
+        int gridIndex = gridIndex3Dto1D(cx, cy, cz, gridResolution);
+        int gridStartIndex = gridCellStartIndices[gridIndex];
+        int gridEndIndex = gridCellEndIndices[gridIndex];
+        if (gridStartIndex < 0) {
+          continue;  // empty cell: a negative start must never be used as an index
+        }
+        // - Access each boid in the cell and compute velocity change from
+        //   the boids rules, if this boid is within the neighborhood distance.
+        for (int slot = gridStartIndex; slot <= gridEndIndex; slot++) {
+          // slot is a position in the sorted index array; the boid id it
+          // refers to is what must be compared against this thread's boid.
+          int currIndex = particleArrayIndices[slot];
+          if (currIndex == index) {
+            continue;
+          }
+          const glm::vec3 otherPos = pos[currIndex];
+          float dist = glm::distance(otherPos, selfPos);
+
+          // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+          if (dist < rule1Distance) {
+            perceived_center += otherPos;
+            rule1N++;
+          }
+
+          // Rule 2: boids try to stay a distance d away from each other
+          if (dist < rule2Distance) {
+            c -= (otherPos - selfPos);
+          }
+
+          // Rule 3: boids try to match the speed of surrounding boids
+          if (dist < rule3Distance) {
+            perceived_velocity += vel1[currIndex];
+            rule3N++;
+          }
+        }
+      }
+    }
+  }
+  if (rule1N > 0) {
+    perceived_center /= rule1N;
+    perceived_center = (perceived_center - selfPos) * rule1Scale;
+  }
+  c *= rule2Scale;
+  if (rule3N > 0) {
+    perceived_velocity /= rule3N;
+    perceived_velocity *= rule3Scale;
+  }
+  glm::vec3 newVel = vel1[index] + perceived_center + c + perceived_velocity;
   // - Clamp the speed change before putting the new speed in vel2
+  if (glm::length(newVel) > maxSpeed) {
+    newVel = glm::normalize(newVel) * maxSpeed;
+  }
+  vel2[index] = newVel;
+}
+
+__global__ void kernRearrangeDataPointers(int N, glm::vec3 *prev_buffer, glm::vec3 *new_buffer, int *particleArrayIndices) {
+  // Gather pass: slot i of new_buffer receives prev_buffer[particleArrayIndices[i]].
+  const int slot = blockIdx.x * blockDim.x + threadIdx.x;
+  if (slot < N) {
+    const int source = particleArrayIndices[slot];
+    new_buffer[slot] = prev_buffer[source];
+  }
+}
+
+__global__ void kernSwapRearrangedDataPointers(int N, glm::vec3 *prev_buffer, glm::vec3 *new_buffer, int *particleArrayIndices) {
+  // Scatter pass: new_buffer[i] is written to prev_buffer[particleArrayIndices[i]].
+  const int slot = blockIdx.x * blockDim.x + threadIdx.x;
+  if (slot < N) {
+    const int destination = particleArrayIndices[slot];
+    prev_buffer[destination] = new_buffer[slot];
+  }
 }
 
 __global__ void kernUpdateVelNeighborSearchCoherent(
@@ -333,14 +534,86 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
   // except with one less level of indirection.
   // This should expect gridCellStartIndices and gridCellEndIndices to refer
   // directly to pos and vel1.
   // - Identify the grid cell that this particle is in
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+  const glm::vec3 selfPos = pos[index];
+  const glm::vec3 gridPos = (selfPos - gridMin) * inverseCellWidth;
+  glm::vec3 perceived_center = glm::vec3(0.0f);
+  glm::vec3 c = glm::vec3(0.0f);
+  glm::vec3 perceived_velocity = glm::vec3(0.0f);
+
+  float rule1N = 0;
+  float rule3N = 0;
+
   // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  //   DIFFERENCE: For best results, consider what order the cells should be
-  //   checked in to maximize the memory benefits of reordering the boids data.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
+  // Shifting by half a cell before flooring picks, per axis, the lower of the
+  // two cells nearest this boid: (c-1, c) when it sits in the lower half of
+  // cell c, (c, c+1) when it sits in the upper half. Always starting at c-1
+  // would miss neighbors just across the upper cell boundary.
+  // NOTE(review): a 2-cell window per axis assumes the cell width is at least
+  // twice the largest rule distance - confirm against initSimulation.
+  const glm::ivec3 base = glm::ivec3(glm::floor(gridPos - 0.5f));
+  const glm::ivec3 lo(imax(0, base.x), imax(0, base.y), imax(0, base.z));
+  for (int cx = lo.x; cx <= base.x + 1 && cx < gridResolution; cx++) {
+    for (int cy = lo.y; cy <= base.y + 1 && cy < gridResolution; cy++) {
+      for (int cz = lo.z; cz <= base.z + 1 && cz < gridResolution; cz++) {
+        // - For each cell, read the start/end indices in the boid pointer array.
+        //   DIFFERENCE: For best results, consider what order the cells should be
+        //   checked in to maximize the memory benefits of reordering the boids data.
+        int gridIndex = gridIndex3Dto1D(cx, cy, cz, gridResolution);
+        int gridStartIndex = gridCellStartIndices[gridIndex];
+        int gridEndIndex = gridCellEndIndices[gridIndex];
+        if (gridStartIndex < 0) {
+          continue;  // empty cell: a negative start must never be used as an index
+        }
+        // - Access each boid in the cell and compute velocity change from
+        //   the boids rules, if this boid is within the neighborhood distance.
+        for (int currIndex = gridStartIndex; currIndex <= gridEndIndex; currIndex++) {
+          if (currIndex == index) {
+            continue;  // pos/vel1 are already cell-sorted, so slots are boid ids here
+          }
+          const glm::vec3 otherPos = pos[currIndex];
+          float dist = glm::distance(otherPos, selfPos);
+
+          // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+          if (dist < rule1Distance) {
+            perceived_center += otherPos;
+            rule1N++;
+          }
+
+          // Rule 2: boids try to stay a distance d away from each other
+          if (dist < rule2Distance) {
+            c -= (otherPos - selfPos);
+          }
+
+          // Rule 3: boids try to match the speed of surrounding boids
+          if (dist < rule3Distance) {
+            perceived_velocity += vel1[currIndex];
+            rule3N++;
+          }
+        }
+      }
+    }
+  }
+  if (rule1N > 0) {
+    perceived_center /= rule1N;
+    perceived_center = (perceived_center - selfPos) * rule1Scale;
+  }
+  c *= rule2Scale;
+  if (rule3N > 0) {
+    perceived_velocity /= rule3N;
+    perceived_velocity *= rule3Scale;
+  }
+  glm::vec3 newVel = vel1[index] + perceived_center + c + perceived_velocity;
+
   // - Clamp the speed change before putting the new speed in vel2
+  if (glm::length(newVel) > maxSpeed) {
+    newVel = glm::normalize(newVel) * maxSpeed;
+  }
+  vel2[index] = newVel;
 }
 
 /**
@@ -348,40 +621,134 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
 */
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
+  // Launch one thread per boid, rounding the block count up to cover them all.
+  const int boidCount = numObjects;
+  const dim3 gridDims((boidCount + blockSize - 1) / blockSize);
+
+  // Velocities first (vel1 -> vel2), then integrate positions with the new ones.
+  kernUpdateVelocityBruteForce<<<gridDims, blockSize>>>(boidCount, dev_pos, dev_vel1, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!");
+  kernUpdatePos<<<gridDims, blockSize>>>(boidCount, dt, dev_pos, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdatePos failed!");
   // TODO-1.2 ping-pong the velocity buffers
+  std::swap(dev_vel1, dev_vel2);  // dev_vel1 now refers to this step's velocities
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
   // TODO-2.1
   // Uniform Grid Neighbor search using Thrust sort.
   // In Parallel:
   // - label each particle with its array index as well as its grid index.
   //   Use 2x width grids.
+  const int boidCount = numObjects;
+  const dim3 boidBlocks((boidCount + blockSize - 1) / blockSize);      // one thread per boid
+  const dim3 cellBlocks((gridCellCount + blockSize - 1) / blockSize);  // one thread per grid cell
+
+  kernComputeIndices<<<boidBlocks, blockSize>>>(boidCount, gridSideCount, gridMinimum,
+    gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+  checkCUDAErrorWithLine("kernComputeIndices failed!");
+
   // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
   //   are welcome to do a performance comparison.
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + boidCount,
+    dev_thrust_particleArrayIndices);
+
+  // Reset both per-cell tables with the value -1 before the start/end pass runs.
+  kernResetIntBuffer<<<cellBlocks, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  checkCUDAErrorWithLine("kernResetIntBuffer for start indices failed!");
+  kernResetIntBuffer<<<cellBlocks, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+  checkCUDAErrorWithLine("kernResetIntBuffer for end indices failed!");
+
   // - Naively unroll the loop for finding the start and end indices of each
   //   cell's data pointers in the array of boid indices
+  kernIdentifyCellStartEnd<<<boidBlocks, blockSize>>>(boidCount, dev_particleGridIndices,
+    dev_gridCellStartIndices, dev_gridCellEndIndices);
+  checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
+
   // - Perform velocity updates using neighbor search
+  kernUpdateVelNeighborSearchScattered<<<boidBlocks, blockSize>>>(boidCount, gridSideCount,
+    gridMinimum, gridInverseCellWidth, gridCellWidth,
+    dev_gridCellStartIndices, dev_gridCellEndIndices,
+    dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!");
+
   // - Update positions
+  kernUpdatePos<<<boidBlocks, blockSize>>>(boidCount, dt, dev_pos, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdatePos failed!");
+
   // - Ping-pong buffers as needed
+  std::swap(dev_vel1, dev_vel2);
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
   // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
   // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
   // In Parallel:
+
   // - Label each particle with its array index as well as its grid index.
   //   Use 2x width grids
+  int N = numObjects;
+  dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
+  dim3 NBlocksPerGrid((gridCellCount + blockSize - 1) / blockSize);
+
+  kernComputeIndices << <fullBlocksPerGrid, threadsPerBlock >> >(N, gridSideCount,
+    gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices,
+    dev_particleGridIndices);
+  checkCUDAErrorWithLine("kernComputeIndices failed!");
+
   // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
   //   are welcome to do a performance comparison.
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + N, dev_thrust_particleArrayIndices);
+
   // - Naively unroll the loop for finding the start and end indices of each
   //   cell's data pointers in the array of boid indices
   // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
   //   the particle data in the simulation array.
   //   CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED
-  // - Perform velocity updates using neighbor search
+  kernResetIntBuffer << <NBlocksPerGrid, threadsPerBlock >> >(gridCellCount, dev_gridCellStartIndices, -1);
+  checkCUDAErrorWithLine("kernResetIntBuffer for start indices failed!");
+
+  kernResetIntBuffer << <NBlocksPerGrid, threadsPerBlock >> >(gridCellCount, dev_gridCellEndIndices, -1);
+  checkCUDAErrorWithLine("kernResetIntBuffer for end indices failed!");
+
+  kernIdentifyCellStartEnd << <fullBlocksPerGrid, threadsPerBlock >> >(N, dev_particleGridIndices,
+    dev_gridCellStartIndices, dev_gridCellEndIndices);
+  checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
+
+  kernRearrangeDataPointers << <fullBlocksPerGrid, threadsPerBlock >> >(N, dev_pos, dev_pos_sorted,
+    dev_particleArrayIndices);
+  checkCUDAErrorWithLine("kernRearrangeDataPointers for dev_pos failed!");
+
+  kernRearrangeDataPointers << <fullBlocksPerGrid, threadsPerBlock >> >(N, dev_vel1, dev_vel1_sorted,
+    dev_particleArrayIndices);
+  checkCUDAErrorWithLine("kernRearrangeDataPointers for dev_vel1 failed!");
+
+  // dev_vel2 is not gathered into dev_vel2_sorted: the kernel below writes
+  // dev_vel2_sorted[i] for every i < N without reading it, so such a copy
+  // would be overwritten before any use.
+
+  //- Perform velocity updates using neighbor search
+  kernUpdateVelNeighborSearchCoherent << <fullBlocksPerGrid, threadsPerBlock >> >(N, gridSideCount, gridMinimum,
+    gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices,
+    dev_pos_sorted, dev_vel1_sorted, dev_vel2_sorted);
+  checkCUDAErrorWithLine("kernUpdateVelNeighborSearchCoherent failed!");
+
   // - Update positions
+  kernUpdatePos << <fullBlocksPerGrid, threadsPerBlock >> >(N, dt, dev_pos_sorted, dev_vel2_sorted);
+  checkCUDAErrorWithLine("kernUpdatePos failed!");
+
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+  kernSwapRearrangedDataPointers << <fullBlocksPerGrid, threadsPerBlock >> >(N, dev_pos, dev_pos_sorted,
+    dev_particleArrayIndices);
+  checkCUDAErrorWithLine("kernSwapRearrangedDataPointers for dev_pos failed!");
+  // Scattering the new velocities (dev_vel2_sorted) into dev_vel1 is the ping-pong step.
+  kernSwapRearrangedDataPointers << <fullBlocksPerGrid, threadsPerBlock >> >(N, dev_vel1, dev_vel2_sorted,
+    dev_particleArrayIndices);
+  checkCUDAErrorWithLine("kernSwapRearrangedDataPointers for dev_vel1 failed!");
+  // NOTE(review): leaves the previous velocities in dev_vel2; likely droppable if nothing else reads dev_vel2 - confirm.
+  kernSwapRearrangedDataPointers << <fullBlocksPerGrid, threadsPerBlock >> >(N, dev_vel2, dev_vel1_sorted,
+    dev_particleArrayIndices);
+  checkCUDAErrorWithLine("kernSwapRearrangedDataPointers for dev_vel2 failed!");
 }
 
 void Boids::endSimulation() {
@@ -390,6 +757,14 @@ void Boids::endSimulation() {
   cudaFree(dev_pos);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_gridCellEndIndices);
+  
+  cudaFree(dev_pos_sorted);
+  cudaFree(dev_vel1_sorted);
+  cudaFree(dev_vel2_sorted);
 }
 
 void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index a29471d..4d96dd8 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,11 +14,11 @@
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
 #define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
+const int N_FOR_VIS = 100000;
 const float DT = 0.2f;
 
 /**