diff --git a/CgMath/CgPointMath.h b/CgMath/CgPointMath.h
index 49d1ea0ce5cd940abbb0b64be13d55f4ed4a6ef9..edcebed1811baf142509b8f4b9f365a1a67834f1 100644
--- a/CgMath/CgPointMath.h
+++ b/CgMath/CgPointMath.h
@@ -15,6 +15,7 @@ using namespace Eigen;
 #include "CgAABB.h"
 
 struct PCA {
+    glm::vec3 centroid;
     glm::vec3 evec0;
     glm::vec3 evec1;
     glm::vec3 evec2;
@@ -173,6 +174,7 @@ inline PCA CgPointMath::calculatePCA(const std::vector<glm::vec3> &points) {
     Eigen::Vector3d evec1 = es.eigenvectors().col(1);
     Eigen::Vector3d evec2 = es.eigenvectors().col(2);
     return {
+        centroid,
         glm::vec3(evec0(0), evec0(1), evec0(2)),
         glm::vec3(evec1(0), evec1(1), evec1(2)),
         glm::vec3(evec2(0), evec2(1), evec2(2)),
@@ -182,7 +184,6 @@ inline PCA CgPointMath::calculatePCA(const std::vector<glm::vec3> &points) {
     };
 }
 
-// calculates an arbitrary verctor perpendicular to the given one
 inline glm::vec3 CgPointMath::getPerpendicularVector(glm::vec3 arg) {
   if((arg.x == 0.0) && (arg.y == 0.0)) {
     if(arg.z == 0.0) return glm::vec3(0.);
diff --git a/CgMath/CgPointWrangler.h b/CgMath/CgPointWrangler.h
index 3bea1a2104e21733e200a8ad65f404d489ce73e0..88c631587149f6a4d18a130765901847d1cab59d 100644
--- a/CgMath/CgPointWrangler.h
+++ b/CgMath/CgPointWrangler.h
@@ -22,16 +22,13 @@
 // this
 #include "CgAABB.h"
 #include "CgPointMath.h"
+#include "CgUtils/Timer.h"
 
 /*
 * more complex point cloud algorithms & data structures
 */
 class CgPointWrangler {
     public:
-
-        // calculate offset between begin and half-way pointer between begin and end
-        // requires end >= begin
-        static unsigned int halfSize(const glm::vec3* begin, const glm::vec3* end);
     
         // rearrange the vec3 between begin and end so they form a kd-tree
         static void buildKdTree(
@@ -53,7 +50,9 @@ class CgPointWrangler {
         );
 
         // naive estimation of normals
-        static std::vector<glm::vec3> estimateNormals(unsigned int neighbourhood_size, const std::vector<glm::vec3>& vertices);
+        static std::vector<std::vector<unsigned int>> getKNeighbourhoods(unsigned int neighbourhood_size, const std::vector<glm::vec3>& vertices);
+        static std::vector<glm::vec3> estimateNormals(const std::vector<glm::vec3>& vertices, const std::vector<std::vector<unsigned int>>& neighbourhoods);
+        static void alignNormals();
 
         static std::vector<unsigned int> getNearestNeighborsSlow(
             unsigned int current_point, 
@@ -69,6 +68,10 @@ class CgPointWrangler {
         );
 
     private:
+        // calculate offset between begin and half-way pointer between begin and end
+        // requires end >= begin
+        static unsigned int halfSize(const glm::vec3* begin, const glm::vec3* end);
+        
         CgPointWrangler(){}
 };
 
@@ -188,11 +191,30 @@ inline int CgPointWrangler::getClosestPointToRay(
   return erg;
 }
 
-// naive estimation of normals
-inline std::vector<glm::vec3> CgPointWrangler::estimateNormals(unsigned int neighbourhood_size, const std::vector<glm::vec3>& vertices) {
+inline std::vector<std::vector<unsigned int>> CgPointWrangler::getKNeighbourhoods(unsigned int neighbourhood_size, const std::vector<glm::vec3>& vertices) {
+    // all k-neighbourhoods
+    std::vector<std::vector<unsigned int>> neighbourhoods;
+    neighbourhoods.resize(vertices.size());
+    CgAABB aabb = CgPointMath::calculateAABB(vertices);
+
+    run_threaded(vertices.size(), [&](
+        const unsigned int start_index, 
+        const unsigned int count
+    ) {
+        for(unsigned int i = start_index; i < start_index + count; i++) {
+            neighbourhoods[i] = getNearestNeighborsFast(i, neighbourhood_size, vertices, aabb);
+        }
+    });
+
+    return neighbourhoods;
+}
+
+inline std::vector<glm::vec3> CgPointWrangler::estimateNormals(
+    const std::vector<glm::vec3>& vertices,
+    const std::vector<std::vector<unsigned int>>& neighbourhoods
+) {
     std::vector<glm::vec3> normals;
     normals.resize(vertices.size());
-    CgAABB aabb = CgPointMath::calculateAABB(vertices);
 
     // find point with highest y-value
     std::vector<bool> visited;
@@ -214,36 +236,22 @@ inline std::vector<glm::vec3> CgPointWrangler::estimateNormals(unsigned int neig
     // the "best" parent found for each vertex
     std::vector<std::pair<unsigned int, double>> scores;
     scores.resize(vertices.size());
-    // all k-neighbourhoods
-    std::vector<std::vector<unsigned int>> neighbourhoods;
-    neighbourhoods.resize(vertices.size());
 
-    // record all neighbourhoods and calculate (possibly flipped) normals
-    unsigned int thread_count = std::thread::hardware_concurrency();
-    std::vector<std::thread> threads;
-    threads.reserve(thread_count);
-
-    // worker for the threads that calculate all k-neighbourhoods
-    auto worker = [&](
+    // worker for doing the PCAs
+    run_threaded(vertices.size(), [&](
         const unsigned int start_index, 
         const unsigned int count
     ) {
         // thread-local buffer for points of one neighbourhood
-        std::vector<glm::vec3> nh(neighbourhood_size);
+        std::vector<glm::vec3> nh;
         for(unsigned int i = start_index; i < start_index + count; i++) {
-            neighbourhoods[i] = getNearestNeighborsFast(i, neighbourhood_size, vertices, aabb);
-            for(int j = 0; j < neighbourhood_size; j++) nh[j] = vertices[neighbourhoods[i][j]];
+            std::vector<unsigned int> nh_indices = neighbourhoods[i];
+            nh.clear();
+            nh.resize(nh_indices.size());
+            for(int j = 0; j < nh_indices.size(); j++) nh[j] = vertices[nh_indices[j]];
             normals[i] = CgPointMath::calculatePCA(nh).evec0;
         }
-    };
-
-    // start some workers
-    unsigned int slice_size = vertices.size() / thread_count;
-    unsigned int slice_rest = vertices.size() % thread_count;
-    unsigned int t = 0;
-    for(t = 0; t < thread_count - 1; t++) threads.push_back(std::thread(worker, t * slice_size, slice_size));
-    threads.push_back(std::thread(worker, t * slice_size, slice_size + slice_rest));
-    for(t = 0; t < thread_count; t++) threads[t].join();
+    });
 
     auto visit = [&](unsigned int curr, glm::vec3 parent_normal, glm::vec3 parent_pos) {
         // flip current normal to conform to parent
@@ -256,7 +264,7 @@ inline std::vector<glm::vec3> CgPointWrangler::estimateNormals(unsigned int neig
 
         // find out which of the neighbours we didn't process yet
         // and score them to find out in which order to process
-        for(unsigned int i = 0; i < neighbourhood_size; i++) {
+        for(unsigned int i = 0; i < curr_nh_ind.size(); i++) {
             unsigned int index = curr_nh_ind[i];
             // score by how far away it is from the local tangent plane
             // and how aligned the normals are.
@@ -295,6 +303,14 @@ inline std::vector<glm::vec3> CgPointWrangler::estimateNormals(unsigned int neig
     return normals;
 }
 
+inline void CgPointWrangler::alignNormals(
+   // glm::vec3 ref_normal, 
+   // const std::vector<unsigned int>& edges, 
+   // std::vector<glm::vec3>& normals
+) {
+
+}
+
 inline std::vector<unsigned int> CgPointWrangler::getNearestNeighborsSlow(
     unsigned int current_point, 
     unsigned int k, 
diff --git a/CgSceneGraph/CgSceneControl.cpp b/CgSceneGraph/CgSceneControl.cpp
index f1243bcc86487a790a0e988882a53bb96a8a3ef5..3373dc9db748c91bc47dd6233d48b830103ea5de 100644
--- a/CgSceneGraph/CgSceneControl.cpp
+++ b/CgSceneGraph/CgSceneControl.cpp
@@ -182,39 +182,76 @@ void CgSceneControl::handlePickEvent(float x, float y) {
 }
 
 void CgSceneControl::handleIncrementalSimplifyEvent(const CgSimplifyEvent& ev) {
+  unsigned int MAX_POINTS = 50;
   std::cout << ev << std::endl;
   // get current points
-  std::vector<glm::vec3> points = m_pointcloud->getVertices();
-  std::vector<glm::vec3> normals = m_pointcloud->getVertexNormals();
-  std::vector<glm::vec3> colors = m_pointcloud->getVertexColors();
-  std::vector<glm::mat4> splat_orientations = m_pointcloud->getSplatOrientations();
-  std::vector<glm::vec2> splat_scaling = m_pointcloud->getSplatScalings();
-  std::vector<unsigned int> splat_indices = m_pointcloud->getSplatIndices();
-
-  delete m_pointcloud;
-  // simplify by filtering the points
+  std::vector<glm::vec3> original_points = m_pointcloud->getVertices();
+  std::vector<glm::vec3> remaining_points;
+  std::vector<glm::vec3> current_cluster;
+  std::vector<bool> used;
+  used.resize(original_points.size(), false);
   std::vector<glm::vec3> new_points;
   std::vector<glm::vec3> new_normals;
   std::vector<glm::vec3> new_colors;
   std::vector<glm::mat4> new_splat_orientations;
-  std::vector<glm::vec2> new_splat_scaling;
+  std::vector<glm::vec2> new_splat_scalings;
   std::vector<unsigned int> new_splat_indices;
-  for(unsigned int i = 0; i < points.size(); i += 2) {
-    new_points.push_back(points[i]);
-    new_normals.push_back(normals[i]);
-    new_colors.push_back(colors[i]);
-    new_splat_orientations.push_back(splat_orientations[i]);
-    new_splat_scaling.push_back(splat_scaling[i]);
-    new_splat_indices.push_back(splat_indices[i]);
-  }
+  unsigned int next_seed = 0;
+
+  while(true) { // while there are still points to consider...
+    std::vector<unsigned int> nh = CgPointWrangler::getNearestNeighborsFast(
+      next_seed, MAX_POINTS + 1,
+      original_points,
+      CgPointMath::calculateAABB(original_points)
+    );
+    unsigned int i;
+    PCA cluster_pca;
+    current_cluster.push_back(original_points[next_seed]);
+    used[next_seed] = true;
+    for(i = 1; i < MAX_POINTS; i++) {
+      current_cluster.push_back(original_points[nh[i]]);
+      used[nh[i]] = true;
+      PCA current_pca = CgPointMath::calculatePCA(current_cluster);
+      float variance = current_pca.eval0 / (current_pca.eval0 + current_pca.eval1 + current_pca.eval2);
+      if(variance > ev.getMaxVariance()) break;
+      cluster_pca = current_pca;
+    }
+    next_seed = i + 1;
+    current_cluster.clear();
+    // push new point information
+    new_points.push_back(cluster_pca.centroid);
+    new_normals.push_back(cluster_pca.evec0);
+    new_colors.push_back(glm::vec3(0.5));
+    new_splat_orientations.push_back(glm::lookAt(
+      cluster_pca.centroid,
+      cluster_pca.centroid - cluster_pca.evec0,
+      CgPointMath::getPerpendicularVector(cluster_pca.evec0)
+    ));
+    new_splat_scalings.push_back(glm::vec2(0.02));
+    new_splat_indices.push_back(new_points.size() - 1);
+
+    // copy unused points over
+    for(unsigned int i = 0; i < used.size(); i++) {
+      if(!used[i]) remaining_points.push_back(original_points[i]);
+      if(i == next_seed) next_seed = remaining_points.size() - 1;
+      used[i] = false;
+    }
+    original_points = remaining_points;
 
+    if(original_points.size() == 0) break;
+    // build new kd-tree in remaining points.
+    CgPointWrangler::buildKdTree(original_points.data(), original_points.data() + original_points.size(), 0);
+    remaining_points.clear();
+    used.resize(original_points.size(), false);
+  }
+  std::cout << new_points.size() << " clusters" << std::endl;
   // re-init the new point cloud with the new point list
   m_pointcloud = new CgPointCloud(
     new_points,
     new_normals,
     new_colors,
     new_splat_orientations,
-    new_splat_scaling,
+    new_splat_scalings,
     new_splat_indices
   );
 }
@@ -337,7 +374,7 @@ void CgSceneControl::handleEvent(CgBaseEvent* e) {
         delete m_pointcloud;
         m_pointcloud = copy;
       } else if(ev->getClusteringType() == Cg::CgIncremental) {
-        handleIncrementalSimplifyEvent(*ev);
+        run_timed("incr. simplify", 1, [&](){handleIncrementalSimplifyEvent(*ev);});
       } else {
         handleHierarchicalSimplifyEvent(*ev);
       }
@@ -370,6 +407,7 @@ void CgSceneControl::handleEvent(CgBaseEvent* e) {
           }
 
           std::vector<glm::vec3> positions;
+          std::vector<std::vector<unsigned int>> neighbourhoods;
           std::vector<glm::vec3> normals;
           std::vector<glm::vec3> colors;
           std::vector<glm::mat4> splat_orientations;
@@ -379,8 +417,15 @@ void CgSceneControl::handleEvent(CgBaseEvent* e) {
 
           std::cout << "loaded " << positions.size() << " points" << std::endl;
           std::cout << "using " << std::thread::hardware_concurrency() << " threads" << std::endl;
-          run_timed("build kd", 1, [&](){CgPointWrangler::buildKdTree(positions.data(), positions.data() + positions.size(), 0);});
-          run_timed("est. normals", 1, [&](){normals = CgPointWrangler::estimateNormals(15, positions);});
+          run_timed("build kd", 1, 
+            [&](){CgPointWrangler::buildKdTree(positions.data(), positions.data() + positions.size(), 0);}
+          );
+          run_timed("get neighbourhoods", 1, 
+            [&](){neighbourhoods = CgPointWrangler::getKNeighbourhoods(15, positions);}
+          );
+          run_timed("est. normals", 1, 
+            [&](){normals = CgPointWrangler::estimateNormals(positions, neighbourhoods);}
+          );
           colors.resize(positions.size());
           splat_orientations.resize(positions.size());
           splat_scalings.resize(positions.size());
diff --git a/CgSceneGraph/CgSceneControl.h b/CgSceneGraph/CgSceneControl.h
index 20689b6dd265f5d672f6ace89db0d7cad02f9a4a..c0560364764fcceaf54c4485790b67bc93970d2f 100644
--- a/CgSceneGraph/CgSceneControl.h
+++ b/CgSceneGraph/CgSceneControl.h
@@ -4,6 +4,8 @@
 #include "CgBase/CgObserver.h"
 #include "CgBase/CgBaseSceneControl.h"
 #include "CgMath/CgAABB.h"
+#include "CgMath/CgPointMath.h"
+#include "CgMath/CgPointWrangler.h"
 #include <glm/glm.hpp>
 #include <vector>
 #include <queue>
diff --git a/CgUtils/Timer.h b/CgUtils/Timer.h
index 799c6f80d5806e2d74beaa29ebcb92d0b8011547..3e13464a880b4b04abe70668627e09e0c8db1862 100644
--- a/CgUtils/Timer.h
+++ b/CgUtils/Timer.h
@@ -4,6 +4,7 @@
 
 #include <iostream>
 #include <functional>
+#include <thread>
 #include <chrono>
 
 // benchmarking utility
@@ -17,4 +18,24 @@ inline void run_timed(std::string tag, unsigned int amount, std::function<void (
     std::cout << "average for " << tag << ": "<< ms_double.count() / double(amount) << " ms" << std::endl;
 }
 
+// threading utility
+// distributes some amount of work over a number of threads
+// workers are lambdas that take a start index and a count of
+// jobs they should do 
+inline void run_threaded(unsigned int job_count, const std::function<void (unsigned int, unsigned int)>& worker) {
+
+    // record all neighbourhoods and calculate (possibly flipped) normals
+    unsigned int thread_count = std::thread::hardware_concurrency();
+    std::vector<std::thread> threads;
+    threads.reserve(thread_count);
+
+    // start some workers
+    unsigned int slice_size = job_count / thread_count;
+    unsigned int slice_rest = job_count % thread_count;
+    unsigned int t = 0;
+    for(t = 0; t < thread_count - 1; t++) threads.push_back(std::thread(worker, t * slice_size, slice_size));
+    threads.push_back(std::thread(worker, t * slice_size, slice_size + slice_rest));
+    for(t = 0; t < thread_count; t++) threads[t].join();
+}
+
 #endif // TIMER_H
\ No newline at end of file