From 9a8340f15ab86be3119f25e224892f8839bac23d Mon Sep 17 00:00:00 2001
From: Roshan Rao <roshan_rao@berkeley.edu>
Date: Mon, 10 Jun 2019 15:35:28 -0700
Subject: [PATCH 01/14] export compile commands

---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6b3e903..2b69752 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,6 +5,7 @@ set (CMAKE_PROJECT_VERSION_MAJOR 1)
 set (CMAKE_PROJECT_VERSION_MINOR 0)
 set (CMAKE_PROJECT_VERSION_PATH 0)
 set (CMAKE_SKIP_RULE_DEPENDENCY TRUE)
+set (CMAKE_EXPORT_COMPILE_COMMANDS 1)
 list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
 
 # Options

From d11dca491d5e2341f0e61686416471e5fef1ffd1 Mon Sep 17 00:00:00 2001
From: Roshan Rao <roshan_rao@berkeley.edu>
Date: Mon, 10 Jun 2019 15:46:17 -0700
Subject: [PATCH 02/14] update gitignore

---
 .gitignore | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 0fc4f9c..5b8c3ab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 tsne_utils
+compile_commands.json
 
 *.o
 *.so
@@ -24,7 +25,7 @@ Multicore-TSNE/dist/*
 build/*
 
 # exception to the rule
-!build/.gitkeep 
+!build/.gitkeep
 
 # Ignore animation
 visualization/animation.gif
@@ -32,3 +33,7 @@ visualization/animation.gif
 .DS_Store
 .mypy_cache/
 .vscode/
+
+# vim
+*.swp
+*.swo

From 6fe82dac7bdd29b661e70fff314a36771565302f Mon Sep 17 00:00:00 2001
From: Roshan Rao <roshan_rao@berkeley.edu>
Date: Mon, 10 Jun 2019 15:47:54 -0700
Subject: [PATCH 03/14] remove compute architectures for easier compilation
 when debugging

---
 CMakeLists.txt | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2b69752..a9e0855 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,13 +58,13 @@ SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
 set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};
                     -O3
                     -Xptxas -dlcm=cg
-                     -gencode=arch=compute_30,code=sm_30
-                     -gencode=arch=compute_35,code=sm_35
-                     -gencode=arch=compute_50,code=sm_50
-                     -gencode=arch=compute_52,code=sm_52
-		             -gencode=arch=compute_60,code=sm_60
+                    # -gencode=arch=compute_30,code=sm_30
+                    # -gencode=arch=compute_35,code=sm_35
+                    # -gencode=arch=compute_50,code=sm_50
+                    # -gencode=arch=compute_52,code=sm_52
+                    # -gencode=arch=compute_60,code=sm_60
 		             -gencode=arch=compute_61,code=sm_61
-                     -gencode=arch=compute_70,code=sm_70
+                    # -gencode=arch=compute_70,code=sm_70
                     -std=c++11
                     -Xcompiler '-O3'
                     -Xcompiler '-fPIC'

From da8ee2c91b123c5c307bba6bb9b0a2accb58e060 Mon Sep 17 00:00:00 2001
From: Roshan Rao <roshan_rao@berkeley.edu>
Date: Mon, 10 Jun 2019 16:40:57 -0700
Subject: [PATCH 04/14] delete unused files, naive tsne files, bh tsne
 namespace

---
 CMakeLists.txt                                |   6 +-
 src/Makefile                                  |  15 -
 src/exe/main.cu                               |  33 +-
 src/ext/pymodule_ext.cu                       |   4 +-
 src/{bh_tsne.cu => fit_tsne.cu}               |  20 +-
 src/include/ext/pymodule_ext.h                |   9 +-
 src/include/{bh_tsne.h => fit_tsne.h}         |   5 +-
 src/include/kernels/apply_forces.h            |  37 +-
 .../{bh_attr_forces.h => attr_forces.h}       |   9 +-
 src/include/kernels/bh_rep_forces.h           |  62 ---
 src/include/kernels/bounding_box.h            |  53 ---
 src/include/kernels/initialization.h          |  30 --
 src/include/kernels/perplexity_search.h       |  30 +-
 src/include/kernels/tree_builder.h            |  49 ---
 src/include/kernels/tree_sort.h               |  42 --
 src/include/kernels/tree_summary.h            |  45 --
 src/include/naive_tsne.h                      | 149 -------
 src/include/naive_tsne_cpu.h                  |  87 ----
 src/include/nbodyfft.h                        |  41 +-
 src/include/tsne_vars.h                       |  20 -
 src/kernels/apply_forces.cu                   |  72 +--
 .../{bh_attr_forces.cu => attr_forces.cu}     |   8 +-
 src/kernels/bh_rep_forces.cu                  | 171 -------
 src/kernels/bounding_box.cu                   | 133 ------
 src/kernels/initialization.cu                 |   9 +-
 src/kernels/perplexity_search.cu              |  96 +---
 src/kernels/tree_builder.cu                   | 189 --------
 src/kernels/tree_sort.cu                      |  72 ---
 src/kernels/tree_summary.cu                   | 166 -------
 src/naive_tsne.cu                             | 416 ------------------
 src/naive_tsne_cpu.cu                         | 183 --------
 src/nbodyfft.cu                               | 124 +++---
 32 files changed, 139 insertions(+), 2246 deletions(-)
 delete mode 100644 src/Makefile
 rename src/{bh_tsne.cu => fit_tsne.cu} (98%)
 rename src/include/{bh_tsne.h => fit_tsne.h} (87%)
 rename src/include/kernels/{bh_attr_forces.h => attr_forces.h} (92%)
 delete mode 100644 src/include/kernels/bh_rep_forces.h
 delete mode 100644 src/include/kernels/bounding_box.h
 delete mode 100644 src/include/kernels/initialization.h
 delete mode 100644 src/include/kernels/tree_builder.h
 delete mode 100644 src/include/kernels/tree_sort.h
 delete mode 100644 src/include/kernels/tree_summary.h
 delete mode 100644 src/include/naive_tsne.h
 delete mode 100644 src/include/naive_tsne_cpu.h
 delete mode 100644 src/include/tsne_vars.h
 rename src/kernels/{bh_attr_forces.cu => attr_forces.cu} (92%)
 delete mode 100644 src/kernels/bh_rep_forces.cu
 delete mode 100644 src/kernels/bounding_box.cu
 delete mode 100644 src/kernels/tree_builder.cu
 delete mode 100644 src/kernels/tree_sort.cu
 delete mode 100644 src/kernels/tree_summary.cu
 delete mode 100644 src/naive_tsne.cu
 delete mode 100644 src/naive_tsne_cpu.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a9e0855..62a94e2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -150,15 +150,13 @@ set(SOURCES
 
     # Kernels
     src/kernels/apply_forces.cu
-    src/kernels/bh_attr_forces.cu
+    src/kernels/attr_forces.cu
     src/kernels/perplexity_search.cu
-    src/kernels/initialization.cu
 
     # Method files
     src/ext/pymodule_ext.cu
-    src/bh_tsne.cu
+    src/fit_tsne.cu
     src/nbodyfft.cu
-    # src/naive_tsne.cu
 )
 
 
diff --git a/src/Makefile b/src/Makefile
deleted file mode 100644
index a3c5243..0000000
--- a/src/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-APPS := bh
-FLAGS := -O3 -arch=sm_61 -g -Xptxas -v
-NVCC := nvcc
-GCC := g++
-CC := $(GCC)
-
-.PHONY: all
-
-all: bh sparse
-
-bh : bh.cu
-	$(NVCC) $(FLAGS) -DVARIANT=0 -o $@ $<
-
-sparse : sparse.cu
-	$(NVCC) $(FLAGS) -lcusparse -o $@ $<
diff --git a/src/exe/main.cu b/src/exe/main.cu
index ce4c95d..a3432f1 100644
--- a/src/exe/main.cu
+++ b/src/exe/main.cu
@@ -1,16 +1,13 @@
-
-// This file exposes a main file which does most of the testing with command line 
+// This file exposes a main file which does most of the testing with command line
 // args, so we don't have to re-build to change options.
 
 // Detailed includes
-#include "common.h"
-#include "util/data_utils.h"
-#include "util/cuda_utils.h"
-#include "util/random_utils.h"
-#include "util/distance_utils.h"
-#include "naive_tsne.h"
-#include "naive_tsne_cpu.h"
-#include "bh_tsne.h"
+// #include "../include/common.h"
+#include "../include/util/data_utils.h"
+// #include "../include/util/cuda_utils.h"
+// #include "../include/util/random_utils.h"
+// #include "../include/util/distance_utils.h"
+#include "../include/fit_tsne.h"
 #include <time.h>
 #include <string>
 
@@ -45,7 +42,7 @@ int main(int argc, char** argv) {
         ("q,dim", "Point Dimensions", cxxopts::value<int>()->default_value("50"))
         ("j,device", "Device to run on", cxxopts::value<int>()->default_value("0"))
         ("h,help", "Print help");
-    
+
     // Parse command line options
     auto result = options.parse(argc, argv);
 
@@ -98,7 +95,7 @@ int main(int argc, char** argv) {
         }
 
         // Do the t-SNE
-        tsnecuda::bh::RunTsne(opt, gpu_opt);
+        tsnecuda::RunTsne(opt, gpu_opt);
 
         // Clean up the data
         delete[] data;
@@ -134,7 +131,7 @@ int main(int argc, char** argv) {
         }
 
         // Do the t-SNE
-        tsnecuda::bh::RunTsne(opt, gpu_opt);
+        tsnecuda::RunTsne(opt, gpu_opt);
 
         // Clean up the data
         delete[] data;
@@ -151,7 +148,7 @@ int main(int argc, char** argv) {
 
         // DO the T-SNE
         printf("Starting TSNE calculation with %u points.\n", num_images);
-        
+
         // Construct the options
         tsnecuda::Options opt(nullptr, data, num_images, num_columns*num_rows*num_channels);
         opt.perplexity = FOPT(perplexity);
@@ -171,8 +168,8 @@ int main(int argc, char** argv) {
         }
 
         // Do the t-SNE
-        tsnecuda::bh::RunTsne(opt, gpu_opt);
-        
+        tsnecuda::RunTsne(opt, gpu_opt);
+
         // Clean up the data
         delete[] data;
 
@@ -194,7 +191,7 @@ int main(int argc, char** argv) {
 
         // Do the T-SNE
         printf("Starting TSNE calculation with %u points.\n", IOPT(num-points));
-        
+
         // Construct the options
         tsnecuda::Options opt(nullptr, thrust::raw_pointer_cast(h_X.data()), IOPT(num-points),  IOPT(dim));
         opt.perplexity = FOPT(perplexity);
@@ -214,7 +211,7 @@ int main(int argc, char** argv) {
         }
 
         // Do the t-SNE
-        tsnecuda::bh::RunTsne(opt, gpu_opt);
+        tsnecuda::RunTsne(opt, gpu_opt);
 
     } else {
         std::cout << "Dataset not recognized..." << std::endl;
diff --git a/src/ext/pymodule_ext.cu b/src/ext/pymodule_ext.cu
index 567b13b..7c2dff3 100644
--- a/src/ext/pymodule_ext.cu
+++ b/src/ext/pymodule_ext.cu
@@ -1,7 +1,7 @@
 
 // Implementation file for the python extensions
 
-#include "include/ext/pymodule_ext.h"
+#include "../include/ext/pymodule_ext.h"
 
 void pymodule_bh_tsne(float *result,
                       float* points,
@@ -108,7 +108,7 @@ void pymodule_bh_tsne(float *result,
     }
 
     // Do the t-SNE
-    tsnecuda::bh::RunTsne(opt, gpu_opt);
+    tsnecuda::RunTsne(opt, gpu_opt);
 
     // Copy the data back from the GPU
     cudaDeviceSynchronize();
diff --git a/src/bh_tsne.cu b/src/fit_tsne.cu
similarity index 98%
rename from src/bh_tsne.cu
rename to src/fit_tsne.cu
index 7183474..28cffc2 100644
--- a/src/bh_tsne.cu
+++ b/src/fit_tsne.cu
@@ -2,8 +2,8 @@
     Compute t-SNE via Barnes-Hut for NlogN time.
 */
 
-#include "bh_tsne.h"
-#include "nbodyfft.h"
+#include "include/fit_tsne.h"
+#include "include/nbodyfft.h"
 #include <chrono>
 
 #define cufftSafeCall(err)  __cufftSafeCall(err, __FILE__, __LINE__)
@@ -200,8 +200,8 @@ void compute_chargesQij(
         num_points, n_terms);
 }
 
-void tsnecuda::bh::RunTsne(tsnecuda::Options &opt,
-                            tsnecuda::GpuOptions &gpu_opt)
+void tsnecuda::RunTsne(tsnecuda::Options &opt,
+                       tsnecuda::GpuOptions &gpu_opt)
 {
     auto start = std::chrono::high_resolution_clock::now();
     auto stop = std::chrono::high_resolution_clock::now();
@@ -279,7 +279,7 @@ void tsnecuda::bh::RunTsne(tsnecuda::Options &opt,
 
     // Initialize global variables
     thrust::device_vector<int> err_device(1);
-    tsnecuda::bh::Initialize(gpu_opt, err_device);
+    // tsnecuda::Initialize(gpu_opt, err_device);
 
     END_IL_TIMER(_time_initialization);
     START_IL_TIMER();
@@ -301,7 +301,7 @@ void tsnecuda::bh::RunTsne(tsnecuda::Options &opt,
 
     // Search Perplexity
     thrust::device_vector<float> pij_non_symmetric_device(num_points * num_neighbors);
-    tsnecuda::bh::SearchPerplexity(gpu_opt, dense_handle, pij_non_symmetric_device, knn_squared_distances_device,
+    tsnecuda::SearchPerplexity(gpu_opt, dense_handle, pij_non_symmetric_device, knn_squared_distances_device,
                                     perplexity, perplexity_search_epsilon, num_points, num_neighbors);
 
     // Clean up memory
@@ -560,7 +560,7 @@ void tsnecuda::bh::RunTsne(tsnecuda::Options &opt,
         // Compute the number of boxes in a single dimension and the total number of boxes in 2d
         // auto n_boxes_per_dim = static_cast<int>(fmax(min_num_intervals, (max_coord - min_coord) / intervals_per_integer));
 
-        precompute_2d(
+        tsnecuda::PrecomputeFFT2D(
             plan_kernel_tilde, max_coord, min_coord, max_coord, min_coord, n_boxes_per_dim, n_interpolation_points,
             box_lower_bounds_device, box_upper_bounds_device, kernel_tilde_device,
             fft_kernel_tilde_device);
@@ -571,7 +571,7 @@ void tsnecuda::bh::RunTsne(tsnecuda::Options &opt,
         START_IL_TIMER();
 
 
-        n_body_fft_2d(
+        tsnecuda::NbodyFFT2D(
             plan_dft, plan_idft,
             N, n_terms, n_boxes_per_dim, n_interpolation_points,
             fft_kernel_tilde_device, n_total_boxes,
@@ -600,7 +600,7 @@ void tsnecuda::bh::RunTsne(tsnecuda::Options &opt,
 
 
         // Calculate Attractive Forces
-        tsnecuda::bh::ComputeAttractiveForces(gpu_opt,
+        tsnecuda::ComputeAttractiveForces(gpu_opt,
                                               sparse_handle,
                                               sparse_matrix_descriptor,
                                               attractive_forces_device,
@@ -618,7 +618,7 @@ void tsnecuda::bh::RunTsne(tsnecuda::Options &opt,
         START_IL_TIMER();
 
         // Apply Forces
-        tsnecuda::bh::ApplyForces(gpu_opt,
+        tsnecuda::ApplyForces(gpu_opt,
                                   points_device,
                                   attractive_forces_device,
                                   repulsive_forces_device,
diff --git a/src/include/ext/pymodule_ext.h b/src/include/ext/pymodule_ext.h
index f6ce78e..ef9f6e9 100644
--- a/src/include/ext/pymodule_ext.h
+++ b/src/include/ext/pymodule_ext.h
@@ -1,12 +1,11 @@
-
 #ifndef PYMODULE_EXT_H
 #define PYMODULE_EXT_H
 
     #include <sys/types.h>
-    #include "common.h"
-    #include "include/options.h"
-    #include "util/distance_utils.h"
-    #include "bh_tsne.h"
+    #include "../common.h"
+    #include "../options.h"
+    #include "../util/distance_utils.h"
+    #include "../fit_tsne.h"
 
     extern "C" {
         /**
diff --git a/src/include/bh_tsne.h b/src/include/fit_tsne.h
similarity index 87%
rename from src/include/bh_tsne.h
rename to src/include/fit_tsne.h
index ae0c2eb..87387d9 100644
--- a/src/include/bh_tsne.h
+++ b/src/include/fit_tsne.h
@@ -21,15 +21,12 @@
 #include "include/util/thrust_transform_functions.h"
 
 #include "include/kernels/apply_forces.h"
-#include "include/kernels/bh_attr_forces.h"
+#include "include/kernels/attr_forces.h"
 #include "include/kernels/perplexity_search.h"
-#include "include/kernels/initialization.h"
 
 namespace tsnecuda {
-namespace bh {
 void RunTsne(tsnecuda::Options &opt, tsnecuda::GpuOptions &gpu_opt);
 }
-}
 
 
 #endif
diff --git a/src/include/kernels/apply_forces.h b/src/include/kernels/apply_forces.h
index 7d399f4..9aeb815 100644
--- a/src/include/kernels/apply_forces.h
+++ b/src/include/kernels/apply_forces.h
@@ -10,12 +10,11 @@
 #ifndef SRC_INCLUDE_KERNELS_APPLY_FORCES_H_
 #define SRC_INCLUDE_KERNELS_APPLY_FORCES_H_
 
-#include "include/common.h"
-#include "include/options.h"
-#include "include/util/cuda_utils.h"
- 
+#include "../common.h"
+#include "../options.h"
+#include "../util/cuda_utils.h"
+
 namespace tsnecuda {
-namespace bh {
 __global__
 void IntegrationKernel(
                                  volatile float * __restrict__ points,
@@ -36,7 +35,7 @@ void ApplyForces(
                     thrust::device_vector<float> &points,
                     thrust::device_vector<float> &attr_forces,
                     thrust::device_vector<float> &rep_forces,
-                    thrust::device_vector<float> &gains, 
+                    thrust::device_vector<float> &gains,
                     thrust::device_vector<float> &old_forces,
                     const float eta,
                     const float normalization,
@@ -47,30 +46,4 @@ void ApplyForces(
                     const int num_blocks
             );
 }
-
-namespace naive {
-
-__global__
-void IntegrationKernel(
-                                 volatile float * __restrict__ points,
-                                 volatile float * __restrict__ forces,
-                                 volatile float * __restrict__ gains,
-                                 volatile float * __restrict__ old_forces,
-                                 const float eta,
-                                 const float momentum,
-                                 const int num_points
-                      );
-
-
-void ApplyForces(thrust::device_vector<float> &points,
-                                  thrust::device_vector<float> &forces,
-                                  thrust::device_vector<float> &gains,
-                                  thrust::device_vector<float> &old_forces,
-                                  const float eta,
-                                  const float momentum,
-                                  const int num_points,
-                                  const int num_blocks);
-
-}
-}
 #endif
diff --git a/src/include/kernels/bh_attr_forces.h b/src/include/kernels/attr_forces.h
similarity index 92%
rename from src/include/kernels/bh_attr_forces.h
rename to src/include/kernels/attr_forces.h
index 174fe1e..59e3404 100644
--- a/src/include/kernels/bh_attr_forces.h
+++ b/src/include/kernels/attr_forces.h
@@ -9,12 +9,12 @@
 #ifndef SRC_INCLUDE_KERNELS_BH_ATTR_FORCES_H_
 #define SRC_INCLUDE_KERNELS_BH_ATTR_FORCES_H_
 
-#include "include/common.h"
-#include "include/options.h"
-#include "include/util/cuda_utils.h"
+#include "../common.h"
+#include "../options.h"
+#include "../util/cuda_utils.h"
 
 namespace tsnecuda {
-namespace bh {
+
  __global__
 void ComputePijxQijKernel(
                             float * __restrict__ attr_forces,
@@ -39,6 +39,5 @@ void ComputeAttractiveForces(
                     const int num_points,
                     const int num_nonzero);
 }
-}
 
 #endif
diff --git a/src/include/kernels/bh_rep_forces.h b/src/include/kernels/bh_rep_forces.h
deleted file mode 100644
index 54a9cb7..0000000
--- a/src/include/kernels/bh_rep_forces.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/**
- * @brief Kernels for computing t-SNE repulsive forces with barnes hut approximation.
- *
- * @file apply_forces.cu
- * @author Roshan Rao
- * @date 2018-05-08
- * Copyright (c) 2018, Regents of the University of California
- */
-#ifndef SRC_INCLUDE_KERNELS_BH_REP_FORCES_H_
-#define SRC_INCLUDE_KERNELS_BH_REP_FORCES_H_
-
-#include "include/common.h"
-#include "include/options.h"
-#include "include/util/cuda_utils.h"
-
-// TSNE-Vars
-extern __device__ volatile int stepd, bottomd, maxdepthd;
-extern __device__ unsigned int blkcntd;
-extern __device__ volatile float radiusd;
-
-namespace tsnecuda {
-namespace bh {
-
-/******************************************************************************/
-/*** compute force ************************************************************/
-/******************************************************************************/
-
-__global__
-void ForceCalculationKernel(volatile int * __restrict__ errd,
-                                          volatile float * __restrict__ x_vel_device,
-                                          volatile float * __restrict__ y_vel_device,
-                                          volatile float * __restrict__ normalization_vec_device,
-                                          const int * __restrict__ cell_sorted,
-                                          const int * __restrict__ children,
-                                          const float * __restrict__ cell_mass,
-                                          volatile float * __restrict__ x_pos_device,
-                                          volatile float * __restrict__ y_pos_device,
-                                          const float theta,
-                                          const float epsilon,
-                                          const int num_nodes,
-                                          const int num_points,
-                                          const int maxdepth_bh_tree,
-                                          const int repulsive_force_threads);
-
-void ComputeRepulsiveForces(tsnecuda::GpuOptions &gpu_opt,
-                                        thrust::device_vector<int> &errd,
-                                        thrust::device_vector<float> &repulsive_forces,
-                                        thrust::device_vector<float> &normalization_vec,
-                                        thrust::device_vector<int> &cell_sorted,
-                                        thrust::device_vector<int> &children,
-                                        thrust::device_vector<float> &cell_mass,
-                                        thrust::device_vector<float> &points,
-                                        const float theta,
-                                        const float epsilon,
-                                        const int num_nodes,
-                                        const int num_points,
-                                        const int num_blocks);
- 
-}
-}
-
-#endif
diff --git a/src/include/kernels/bounding_box.h b/src/include/kernels/bounding_box.h
deleted file mode 100644
index 573154e..0000000
--- a/src/include/kernels/bounding_box.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/**
- * @brief Kernels for computing bounding box of points in 2 dimensions.
- *
- * @file apply_forces.cu
- * @author Roshan Rao
- * @date 2018-05-08
- * Copyright (c) 2018, Regents of the University of California
- */
-#ifndef SRC_INCLUDE_KERNELS_BOUNDING_BOX_H_
-#define SRC_INCLUDE_KERNELS_BOUNDING_BOX_H_
-
-#include "include/common.h"
-#include "include/options.h"
-#include "include/util/cuda_utils.h"
-
-//TSNE-Vars
-extern __device__ volatile int stepd, bottomd, maxdepthd;
-extern __device__ unsigned int blkcntd;
-extern __device__ volatile float radiusd;
-
-namespace tsnecuda {
-namespace bh {
-__global__
-void BoundingBoxKernel(
-                       volatile int * __restrict__ cell_starts, 
-                       volatile int * __restrict__ children, 
-                       volatile float * __restrict__ cell_mass, 
-                       volatile float * __restrict__ x_pos_device, 
-                       volatile float * __restrict__ y_pos_device, 
-                       volatile float * __restrict__ x_max_device, 
-                       volatile float * __restrict__ y_max_device, 
-                       volatile float * __restrict__ x_min_device, 
-                       volatile float * __restrict__ y_min_device,
-                       const int num_nodes,
-                       const int num_points,
-                       const int bounding_box_threads);
-
-void ComputeBoundingBox(tsnecuda::GpuOptions &gpu_opt,
-                                thrust::device_vector<int> &cell_starts,
-                                thrust::device_vector<int> &children,
-                                thrust::device_vector<float> &cell_mass,
-                                thrust::device_vector<float> &points,
-                                thrust::device_vector<float> &x_max_device,
-                                thrust::device_vector<float> &y_max_device,
-                                thrust::device_vector<float> &x_min_device,
-                                thrust::device_vector<float> &y_min_device,
-                                const int num_nodes,
-                                const int num_points,
-                                const int num_blocks);
-}
-}
-
-#endif
diff --git a/src/include/kernels/initialization.h b/src/include/kernels/initialization.h
deleted file mode 100644
index 199abab..0000000
--- a/src/include/kernels/initialization.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/**
- * @brief Kernels for computing t-SNE attractive forces with nearest neighbor approximation.
- *
- * @file apply_forces.cu
- * @author Roshan Rao
- * @date 2018-05-08
- * Copyright (c) 2018, Regents of the University of California
- */
-#ifndef SRC_INCLUDE_KERNELS_INITIALIZATION_H_
-#define SRC_INCLUDE_KERNELS_INITIALIZATION_H_
-
-#include "include/common.h"
-#include "include/options.h"
-#include "include/util/cuda_utils.h"
-
-#include "include/kernels/apply_forces.h"
-#include "include/kernels/bh_attr_forces.h"
-#include "include/kernels/perplexity_search.h"
-
-namespace tsnecuda {
-namespace bh {
-__global__ void InitializationKernel(int * __restrict errd);
-void Initialize(tsnecuda::GpuOptions &gpu_opt, thrust::device_vector<int> &errd);
-}
-namespace naive {
-void Initialize();
-}
-}
-
-#endif
diff --git a/src/include/kernels/perplexity_search.h b/src/include/kernels/perplexity_search.h
index 1d05576..94d76e0 100644
--- a/src/include/kernels/perplexity_search.h
+++ b/src/include/kernels/perplexity_search.h
@@ -9,12 +9,12 @@
 #ifndef SRC_INCLUDE_KERNELS_PERPLEXITY_SEARCH_H_
 #define SRC_INCLUDE_KERNELS_PERPLEXITY_SEARCH_H_
 
-#include "include/common.h"
-#include "include/options.h"
-#include "include/util/cuda_utils.h"
-#include "include/util/reduce_utils.h"
-#include "include/util/matrix_broadcast_utils.h"
-#include "include/util/thrust_transform_functions.h"
+#include "../common.h"
+#include "../options.h"
+#include "../util/cuda_utils.h"
+#include "../util/reduce_utils.h"
+#include "../util/matrix_broadcast_utils.h"
+#include "../util/thrust_transform_functions.h"
 
 namespace tsnecuda {
 __global__
@@ -28,7 +28,6 @@ void PerplexitySearchKernel(
                             const float perplexity_target,
                             const float epsilon,
                             const int num_points);
-namespace bh {
 __global__
 void ComputePijKernel(
                       volatile float * __restrict__ pij,
@@ -47,21 +46,4 @@ void SearchPerplexity(tsnecuda::GpuOptions &gpu_opt,
                             const int num_near_neighbors);
 }
 
-namespace naive {
-__global__
-void ComputePijKernel(
-                        volatile float * __restrict__ pij,
-                        const float * __restrict__ squared_dist,
-                        const float * __restrict__ betas,
-                        const unsigned int num_points);
-
-void SearchPerplexity(cublasHandle_t &handle,
-                                     thrust::device_vector<float> &pij,
-                                     thrust::device_vector<float> &squared_dist,
-                                     const float perplexity_target,
-                                     const float epsilon,
-                                     const int num_points);
-}
-}
-
 #endif
diff --git a/src/include/kernels/tree_builder.h b/src/include/kernels/tree_builder.h
deleted file mode 100644
index b56012d..0000000
--- a/src/include/kernels/tree_builder.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/**
- * @brief Kernels for computing t-SNE attractive forces with nearest neighbor approximation.
- *
- * @file apply_forces.cu
- * @author Roshan Rao
- * @date 2018-05-08
- * Copyright (c) 2018, Regents of the University of California
- */
-#ifndef SRC_INCLUDE_KERNELS_TREE_BUILDER_H_
-#define SRC_INCLUDE_KERNELS_TREE_BUILDER_H_
-
-#include "include/common.h"
-#include "include/options.h"
-#include "include/util/cuda_utils.h"
-
-//TSNE-Vars
-extern __device__ volatile int stepd, bottomd, maxdepthd;
-extern __device__ unsigned int blkcntd;
-extern __device__ volatile float radiusd;
-
-namespace tsnecuda {
-namespace bh {
-__global__
-void ClearKernel1(volatile int * __restrict__ children, const int num_nodes, const int num_points);
-
-__global__
-void TreeBuildingKernel(volatile int * __restrict__ errd, 
-                                        volatile int * __restrict__ children, 
-                                        volatile float * __restrict__ x_pos_device, 
-                                        volatile float * __restrict__ y_pos_device,
-                                        const int num_nodes,
-                                        const int num_points);
-
-__global__
-void ClearKernel2(volatile int * __restrict__ cell_starts, volatile float * __restrict__ cell_mass, const int num_nodes);
-
-void BuildTree(tsnecuda::GpuOptions &gpu_opt,
-                    thrust::device_vector<int> &errd,
-                    thrust::device_vector<int> &children,
-                    thrust::device_vector<int> &cell_starts,
-                    thrust::device_vector<float> &cell_mass,
-                    thrust::device_vector<float> &points,
-                    const int num_nodes,
-                    const int num_points,
-                    const int num_blocks);
-}
-}
-
-#endif
diff --git a/src/include/kernels/tree_sort.h b/src/include/kernels/tree_sort.h
deleted file mode 100644
index 7f66193..0000000
--- a/src/include/kernels/tree_sort.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * @brief Kernels for sorting tree cells by morton code.
- *
- * @file tree_sort.cu
- * @author Roshan Rao
- * @date 2018-05-08
- * Copyright (c) 2018, Regents of the University of California
- */
-#ifndef SRC_INCLUDE_KERNELS_TREE_SORT_H_
-#define SRC_INCLUDE_KERNELS_TREE_SORT_H_
-
-#include "include/common.h"
-#include "include/options.h"
-#include "include/util/cuda_utils.h"
-
-//TSNE-Vars
-extern __device__ volatile int stepd, bottomd, maxdepthd;
-extern __device__ unsigned int blkcntd;
-extern __device__ volatile float radiusd;
-
-namespace tsnecuda {
-namespace bh {
-__global__
-void SortKernel(int * __restrict__ cell_sorted, 
-                              volatile int * __restrict__ cell_starts, 
-                              int * __restrict__ children,
-                              const int * __restrict__ cell_counts, 
-                              const int num_nodes,
-                              const int num_points);
-
-void SortCells(tsnecuda::GpuOptions &gpu_opt,
-                thrust::device_vector<int> &cell_sorted,
-                thrust::device_vector<int> &cell_starts,
-                thrust::device_vector<int> &children,
-                thrust::device_vector<int> &cell_counts,
-                const int num_nodes,
-                const int num_points,
-                const int num_blocks);
-}
-}
-
-#endif
diff --git a/src/include/kernels/tree_summary.h b/src/include/kernels/tree_summary.h
deleted file mode 100644
index b12a7c8..0000000
--- a/src/include/kernels/tree_summary.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/**
- * @brief Kernels for computing summary of a quad tree.
- *
- * @file apply_forces.cu
- * @author Roshan Rao
- * @date 2018-05-08
- * Copyright (c) 2018, Regents of the University of California
- */
-#ifndef SRC_INCLUDE_KERNELS_TREE_SUMMARY_H_
-#define SRC_INCLUDE_KERNELS_TREE_SUMMARY_H_
-
-#include "include/common.h"
-#include "include/options.h"
-#include "include/util/cuda_utils.h"
-
-//TSNE-Vars
-extern __device__ volatile int stepd, bottomd, maxdepthd;
-extern __device__ unsigned int blkcntd;
-extern __device__ volatile float radiusd;
-
-namespace tsnecuda {
-namespace bh {
-__global__
-void SummarizationKernel(
-                               volatile int * __restrict cell_counts, 
-                               volatile float * __restrict cell_mass, 
-                               volatile float * __restrict x_pos_device, 
-                               volatile float * __restrict y_pos_device,
-                               const int * __restrict children,
-                               const int num_nodes,
-                               const int num_points,
-                               const int gpu_summary_threads);
-
-void SummarizeTree(tsnecuda::GpuOptions &gpu_opt,
-                        thrust::device_vector<int> &cell_counts,
-                        thrust::device_vector<int> &children,
-                        thrust::device_vector<float> &cell_mass,
-                        thrust::device_vector<float> &pts_device,
-                        const int num_nodes,
-                        const int num_points,
-                        const int num_blocks);
-}
-}
-
-#endif
diff --git a/src/include/naive_tsne.h b/src/include/naive_tsne.h
deleted file mode 100644
index 1101c1f..0000000
--- a/src/include/naive_tsne.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/**
- * @brief Naive implementation of T-SNE O(n^2)
- * 
- * @file naive_tsne.h
- * @author David Chan
- * @date 2018-04-04
- */
-
-#ifndef NAIVE_TSNE_H
-#define NAIVE_TSNE_H
-
-#include "common.h"
-#include "util/cuda_utils.h"
-#include "util/math_utils.h"
-#include "util/matrix_broadcast_utils.h"
-#include "util/reduce_utils.h"
-#include "util/distance_utils.h"
-#include "util/random_utils.h"
-#include "util/thrust_utils.h"
-#include "include/util/thrust_transform_functions.h"
-#include "include/util/debug_utils.h"
-
-namespace NaiveTSNE {
-    /**
-     * @brief Compute the Pij distribution O(n^2)
-     * 
-     * @param handle CUBLAS handle
-     * @param points The points in an NxNDIM column-major array
-     * @param sigma The list of sigmas for each point
-     * @param pij The computed pij value
-     * @param N The number of points
-     * @param NDIMS The number of dimensions for each point
-     */
-    void compute_pij(cublasHandle_t &handle, 
-                thrust::device_vector<float> &pij,  
-                const thrust::device_vector<float> &points, 
-                const thrust::device_vector<float> &sigma,
-                const unsigned int N, 
-                const unsigned int NDIMS);
-    /**
-     * @brief Compute the Pij based on P(i|j) using Pij = P(i|j) + P(j|i)/2N O(n^2)
-     * 
-     * @param handle CUBLAS handle
-     * @param pij_vals Computed P(i|j) values
-     * @param N The number of points
-     */
-    void symmetrize_pij(cublasHandle_t &handle, 
-                            thrust::device_vector<float> &pij, 
-                            const unsigned int N);
-
-    /**
-     * @brief Searches the right sigmas for computing pij
-     * 
-     * @param handle CUBLAS handle
-     * @param points Original Points
-     * @param perplexity_target Target perplexity for the Search
-     * @param eps Error tolerance for the perplexity search
-     * @param N The number of points
-     * @param NDIMS Number of Dimensions of points
-     * @return thrust::device_vector<float> Computed Sigmas based on the perplexity target
-     */
-    thrust::device_vector<float> search_perplexity(cublasHandle_t &handle,
-                        thrust::device_vector<float> &points,
-                        const float perplexity_target,
-                        const float eps,
-                        const unsigned int N,
-                        const unsigned int NDIMS);
-
-    ///@private
-    void thrust_search_perplexity(cublasHandle_t &handle,
-                        thrust::device_vector<float> &sigmas,
-                        thrust::device_vector<float> &lower_bound,
-                        thrust::device_vector<float> &upper_bound,
-                        thrust::device_vector<float> &perplexity,
-                        const thrust::device_vector<float> &pij,
-                        const float target_perplexity,
-                        const unsigned int N);
-
-    /**
-     * @brief Compute the T-SNE gradients 
-     * 
-     * @param handle CUBLAS handle
-     * @param forces The forces output array
-     * @param dist Placeholder array for the distances
-     * @param ys The current projected points
-     * @param pij The P(i|j) distribution
-     * @param qij Placeholder for the computed Q(i|j) distribution
-     * @param N The number of points
-     * @param PROJDIM The number of dimensions to project into
-     * @param eta The learning rate
-     * @return float Returns the K-L Divergence of the two distributions (as the loss)
-     */
-    float compute_gradients(cublasHandle_t &handle, 
-                            thrust::device_vector<float> &forces,
-                            thrust::device_vector<float> &dist, 
-                            thrust::device_vector<float> &ys, 
-                            thrust::device_vector<float> &pij, 
-                            thrust::device_vector<float> &qij, 
-                            const unsigned int N,
-                            const unsigned int PROJDIM,
-                            float eta);
-
-    /**
-     * @brief Perform T-SNE using the naive O(n^2) forces computation
-     * 
-     * @param handle The CUBLAS handle to use
-     * @param points The array of points in column-major NxNDIM matrix
-     * @param N The number of points
-     * @param NDIMS The number of dimentions the original points are in
-     * @param PROJDIM The number of dimensions to project to
-     * @return thrust::device_vector<float> The projected points
-     */
-    thrust::device_vector<float> tsne(cublasHandle_t &handle, 
-                                        thrust::device_vector<float> &points, 
-                                        const unsigned int N, 
-                                        const unsigned int NDIMS,
-                                        const unsigned int PROJDIM);
-
-    /**
-     * @brief Perform T-SNE using the naive O(n^2) forces method with some
-     * additional parameters
-     * 
-     * @param handle The CUBLAS handle to use
-     * @param d_points  The array of points in coloumn-major NxNDIM matrix
-     * @param N_POINTS  The number of points that we're using
-     * @param N_DIMS  The number of dimensions that the original points are in
-     * @param proj_dim  The number of dimensions we're projecting to
-     * @param perplexity The perplexity that we're shooting for in our distribution
-     * @param early_ex The amount of early exaggeration to use
-     * @param learning_rate The learning rate to use
-     * @param n_iter The number of iterations to perform
-     * @param n_iter_np The number of iterations to go without progress before early termination 
-     * @param min_g_norm  The norm of the forces to keep above
-     * @return thrust::device_vector<float> The projected points
-     */
-    thrust::device_vector<float> tsne(cublasHandle_t &handle, 
-                                        thrust::device_vector<float> &d_points, 
-                                        unsigned int N_POINTS, 
-                                        unsigned int N_DIMS, 
-                                        unsigned int proj_dim, 
-                                        float perplexity, 
-                                        float early_ex, 
-                                        float learning_rate, 
-                                        unsigned int n_iter, 
-                                        unsigned int n_iter_np, 
-                                        float min_g_norm);
-}
-
-#endif
diff --git a/src/include/naive_tsne_cpu.h b/src/include/naive_tsne_cpu.h
deleted file mode 100644
index 60a8362..0000000
--- a/src/include/naive_tsne_cpu.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/**
- * @brief Naive implementation of T-SNE O(n^2) on CPU
- * 
- * @file naive_tsne_cpu.h
- * @author Forrest Huang
- * @date 2018-04-11
- */
-
-#ifndef NAIVE_TSNE_CPU_H
-#define NAIVE_TSNE_CPU_H
-
-#include "common.h"
-#include "util/cuda_utils.h"
-#include "util/math_utils.h"
-#include "util/matrix_broadcast_utils.h"
-#include "util/reduce_utils.h"
-#include "util/distance_utils.h"
-#include "util/random_utils.h"
-
-
-std::vector<float> squared_pairwise_dist(std::vector<float> &points, const unsigned int N, const unsigned int NDIMS);
-
-std::vector<float> sigmas_search_cpu(std::vector<float> &points,  
-                             const unsigned int N, 
-                             const unsigned int NDIMS,
-                             float target_perplexity);
-
-bool perplexity_equal(const float delta, float perplexity, float target_perplexity);
-
-bool compare_perplexity(std::vector<float>& pij, 
-             float& lo, 
-             float& mid, 
-             float& high, 
-             const unsigned int i, 
-             const unsigned int N, 
-             const float delta, 
-             const float target_perplexity);
-
-float get_perplexity(std::vector<float> & pij, const unsigned int i, unsigned int N);
-
-void recompute_pij_row_cpu(std::vector<float> &points, 
-                             std::vector<float> &pij, 
-                             float sigma,
-                             float i, 
-                             const unsigned int N, 
-                             const unsigned int NDIMS);
-
-std::vector<float> compute_pij_cpu(std::vector<float> &points, 
-                             std::vector<float> &sigma, 
-                             const unsigned int N, 
-                             const unsigned int NDIMS);
-
-std::vector<float> compute_qij_cpu(std::vector<float>& ys, const unsigned int N, const unsigned int PROJDIMS);
-
-float kl_div(float pij, float qij);
-
-float compute_gradients_cpu(std::vector<float> &forces,
-                      std::vector<float> &dist, 
-                        std::vector<float> &ys, 
-                        std::vector<float> &pij, 
-                        std::vector<float> &qij, 
-                        const unsigned int N,
-                        float eta);
-
-std::vector<float> naive_tsne_cpu(std::vector<float> &points, 
-                              const unsigned int N, 
-                              const unsigned int NDIMS);
-
-
-/*
-std::vector<float> compute_qij_cpu(std::vector<float> &ys,
-                                   const unsigned int N,
-                                   const unsigned int PROJDIM);
-
-float compute_gradients_cpu(std::vector<float> &forces,
-                        std::vector<float> &dist, 
-                        std::vector<float> &ys, 
-                        std::vector<float> &pij, 
-                        std::vector<float> &qij, 
-                        const unsigned int N,
-                        float eta);
-
-std::vector<float> naive_tsne_cpu(std::vector<float> &points, 
-                              const unsigned int N, 
-                              const unsigned int NDIMS);
-*/
-#endif
diff --git a/src/include/nbodyfft.h b/src/include/nbodyfft.h
index 3c5cd93..01154f5 100644
--- a/src/include/nbodyfft.h
+++ b/src/include/nbodyfft.h
@@ -2,26 +2,34 @@
 #define NBODYFFT_H
 
 #include <complex>
-#include "common.h"
+#include <cufft.h>
 #include <thrust/complex.h>
+#include "common.h"
+#include "include/util/cuda_utils.h"
+#include "include/util/matrix_broadcast_utils.h"
 
-using namespace std;
-
-typedef float (*kernel_type)(float, float);
-
-typedef float (*kernel_type_2d)(float, float, float, float);
+namespace tsnecuda {
 
-void precompute_2d(cufftHandle &plan_kernel_tilde, float x_max, float x_min, float y_max, float y_min, int n_boxes, int n_interpolation_points,
-                   thrust::device_vector<float> &box_lower_bounds_device, thrust::device_vector<float> &box_upper_bounds_device, 
-                   thrust::device_vector<float> &kernel_tilde_device, thrust::device_vector<thrust::complex<float>> &fft_kernel_tilde_device);
+void PrecomputeFFT2D(
+        cufftHandle &plan_kernel_tilde,
+        float x_max,
+        float x_min,
+        float y_max,
+        float y_min,
+        int n_boxes,
+        int n_interpolation_points,
+        thrust::device_vector<float> &box_lower_bounds_device,
+        thrust::device_vector<float> &box_upper_bounds_device,
+        thrust::device_vector<float> &kernel_tilde_device,
+        thrust::device_vector<thrust::complex<float> > &fft_kernel_tilde_device);
 
-void n_body_fft_2d(
+void NbodyFFT2D(
     cufftHandle &plan_dft,
     cufftHandle &plan_idft,
-    int N, 
-    int n_terms, 
+    int N,
+    int n_terms,
     int n_boxes,
-    int n_interpolation_points, 
+    int n_interpolation_points,
     thrust::device_vector<thrust::complex<float>> &fft_kernel_tilde_device,
     int n_total_boxes,
     int total_interpolation_points,
@@ -33,7 +41,7 @@ void n_body_fft_2d(
     thrust::device_vector<float> &fft_input,
     thrust::device_vector<thrust::complex<float>> &fft_w_coefficients,
     thrust::device_vector<float> &fft_output,
-    thrust::device_vector<int> &point_box_idx_device, 
+    thrust::device_vector<int> &point_box_idx_device,
     thrust::device_vector<float> &x_in_box_device,
     thrust::device_vector<float> &y_in_box_device,
     thrust::device_vector<float> &points_device,
@@ -51,9 +59,6 @@ void n_body_fft_2d(
     thrust::device_vector<float> &y_interpolated_values_device,
     thrust::device_vector<float> &potentialsQij_device);
 
-void interpolate(int n_interpolation_points, int N, const float *y_in_box, const float *y_tilde_spacings,
-                 float *interpolated_values, const float *denominator);
-
-float* get_ntime();
+}
 
 #endif
diff --git a/src/include/tsne_vars.h b/src/include/tsne_vars.h
deleted file mode 100644
index 537bd3e..0000000
--- a/src/include/tsne_vars.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/**
- * @brief Barnes-hut t-SNE global variables
- *
- * @file apply_forces.cu
- * @author Roshan Rao
- * @date 2018-05-08
- * Copyright (c) 2018, Regents of the University of California
- */
-#ifndef INITIALIZATION_VARS
-#ifndef SRC_INCLUDE_KERNELS_TSNEVARS_H_
-#define SRC_INCLUDE_KERNELS_TSNEVARS_H_
-
-extern __device__ volatile int stepd, bottomd, maxdepthd;
-extern __device__ unsigned int blkcntd;
-extern __device__ volatile float radiusd;
-
-#endif
-#endif
-
-
diff --git a/src/kernels/apply_forces.cu b/src/kernels/apply_forces.cu
index a006223..0160993 100644
--- a/src/kernels/apply_forces.cu
+++ b/src/kernels/apply_forces.cu
@@ -1,9 +1,8 @@
-
 /*
     Apply forces to the points with momentum, exaggeration, etc.
 */
 
-#include "include/kernels/apply_forces.h"
+#include "../include/kernels/apply_forces.h"
 
 
 /******************************************************************************/
@@ -11,7 +10,7 @@
 /******************************************************************************/
 // Edited to add momentum, repulsive, attr forces, etc.
 __global__
-void tsnecuda::bh::IntegrationKernel(
+void tsnecuda::IntegrationKernel(
                                  volatile float * __restrict__ points,
                                  volatile float * __restrict__ attr_forces,
                                  volatile float * __restrict__ rep_forces,
@@ -59,52 +58,11 @@ void tsnecuda::bh::IntegrationKernel(
    }
 }
 
-__global__
-void tsnecuda::naive::IntegrationKernel(
-                                 volatile float * __restrict__ points,
-                                 volatile float * __restrict__ forces,
-                                 volatile float * __restrict__ gains,
-                                 volatile float * __restrict__ old_forces,
-                                 const float eta,
-                                 const float momentum,
-                                 const int num_points)
-{
-  register int i, inc;
-  register float dx, dy, ux, uy, gx, gy;
-
-  // iterate over all bodies assigned to thread
-  inc = blockDim.x * gridDim.x;
-  for (i = threadIdx.x + blockIdx.x * blockDim.x; i < num_points; i += inc) {
-        ux = old_forces[i];
-        uy = old_forces[num_points + i];
-        gx = gains[i];
-        gy = gains[num_points + i];
-        dx = forces[i];
-        dy = forces[num_points + i];
-
-        gx = (signbit(dx) != signbit(ux)) ? gx + 0.2 : gx * 0.8;
-        gy = (signbit(dy) != signbit(uy)) ? gy + 0.2 : gy * 0.8;
-        gx = (gx < 0.01) ? 0.01 : gx;
-        gy = (gy < 0.01) ? 0.01 : gy;
-
-        ux = momentum * ux - eta * gx * dx;
-        uy = momentum * uy - eta * gy * dy;
-
-        points[i] += ux;
-        points[num_points + i] += uy;
-
-        old_forces[i] = ux;
-        old_forces[num_points + i] = uy;
-        gains[i] = gx;
-        gains[num_points + i] = gy;
-   }
-}
-
-void tsnecuda::bh::ApplyForces(tsnecuda::GpuOptions &gpu_opt,
+void tsnecuda::ApplyForces(tsnecuda::GpuOptions &gpu_opt,
                                 thrust::device_vector<float> &points,
                                 thrust::device_vector<float> &attr_forces,
                                 thrust::device_vector<float> &rep_forces,
-                                thrust::device_vector<float> &gains, 
+                                thrust::device_vector<float> &gains,
                                 thrust::device_vector<float> &old_forces,
                                 const float eta,
                                 const float normalization,
@@ -114,8 +72,8 @@ void tsnecuda::bh::ApplyForces(tsnecuda::GpuOptions &gpu_opt,
                                 const int num_points,
                                 const int num_blocks)
 {
-    tsnecuda::bh::IntegrationKernel<<<num_blocks * gpu_opt.integration_kernel_factor,
-                                      gpu_opt.integration_kernel_threads>>>(
+    tsnecuda::IntegrationKernel<<<num_blocks * gpu_opt.integration_kernel_factor,
+                                  gpu_opt.integration_kernel_threads>>>(
                     thrust::raw_pointer_cast(points.data()),
                     thrust::raw_pointer_cast(attr_forces.data()),
                     thrust::raw_pointer_cast(rep_forces.data()),
@@ -125,21 +83,3 @@ void tsnecuda::bh::ApplyForces(tsnecuda::GpuOptions &gpu_opt,
                     num_nodes, num_points);
     GpuErrorCheck(cudaDeviceSynchronize());
 }
-
-void tsnecuda::naive::ApplyForces(thrust::device_vector<float> &points,
-                                  thrust::device_vector<float> &forces,
-                                  thrust::device_vector<float> &gains,
-                                  thrust::device_vector<float> &old_forces,
-                                  const float eta,
-                                  const float momentum,
-                                  const int num_points,
-                                  const int num_blocks)
-{
-    tsnecuda::naive::IntegrationKernel<<<num_blocks * 3, 512>>>(
-                    thrust::raw_pointer_cast(points.data()),
-                    thrust::raw_pointer_cast(forces.data()),
-                    thrust::raw_pointer_cast(gains.data()),
-                    thrust::raw_pointer_cast(old_forces.data()),
-                    eta, momentum, num_points);
-    GpuErrorCheck(cudaDeviceSynchronize());
-}
diff --git a/src/kernels/bh_attr_forces.cu b/src/kernels/attr_forces.cu
similarity index 92%
rename from src/kernels/bh_attr_forces.cu
rename to src/kernels/attr_forces.cu
index 6ebc319..1d83778 100644
--- a/src/kernels/bh_attr_forces.cu
+++ b/src/kernels/attr_forces.cu
@@ -6,10 +6,10 @@
     Attractive force is given by pij*qij.
 */
 
-#include "include/kernels/bh_attr_forces.h"
+#include "../include/kernels/attr_forces.h"
 
 __global__
-void tsnecuda::bh::ComputePijxQijKernel(
+void tsnecuda::ComputePijxQijKernel(
                             float * __restrict__ attr_forces,
                             const float * __restrict__ pij,
                             const float * __restrict__ points,
@@ -34,7 +34,7 @@ void tsnecuda::bh::ComputePijxQijKernel(
     atomicAdd(attr_forces + num_nodes + 1 + i, pijqij * dy);
 }
 
-void tsnecuda::bh::ComputeAttractiveForces(
+void tsnecuda::ComputeAttractiveForces(
                     tsnecuda::GpuOptions &gpu_opt,
                     cusparseHandle_t &handle,
                     cusparseMatDescr_t &descr,
@@ -53,7 +53,7 @@ void tsnecuda::bh::ComputeAttractiveForces(
     // TODO: this is bad style
     const int BLOCKSIZE = 1024;
     const int NBLOCKS = iDivUp(num_nonzero, BLOCKSIZE);
-    tsnecuda::bh::ComputePijxQijKernel<<<NBLOCKS, BLOCKSIZE>>>(
+    tsnecuda::ComputePijxQijKernel<<<NBLOCKS, BLOCKSIZE>>>(
                     thrust::raw_pointer_cast(attr_forces.data()),
                     thrust::raw_pointer_cast(sparse_pij.data()),
                     thrust::raw_pointer_cast(points.data()),
diff --git a/src/kernels/bh_rep_forces.cu b/src/kernels/bh_rep_forces.cu
deleted file mode 100644
index a250f6b..0000000
--- a/src/kernels/bh_rep_forces.cu
+++ /dev/null
@@ -1,171 +0,0 @@
-// TODO: add copyright
-
-/*
-        Traverses the tree and calculates approximate repulsive forces via Barnes-Hut.
-
-        t-SNE repulsive forces are given by qij*qijN(y_i - y_j). This also simultaneously
-        calculates tne normalization constant N.
-
-*/
-
-#include "include/kernels/bh_rep_forces.h"
-
-/******************************************************************************/
-/*** compute force ************************************************************/
-/******************************************************************************/
-
-__global__
-void tsnecuda::bh::ForceCalculationKernel(volatile int * __restrict__ errd,
-                                          volatile float * __restrict__ x_vel_device,
-                                          volatile float * __restrict__ y_vel_device,
-                                          volatile float * __restrict__ normalization_vec_device,
-                                          const int * __restrict__ cell_sorted,
-                                          const int * __restrict__ children,
-                                          const float * __restrict__ cell_mass,
-                                          volatile float * __restrict__ x_pos_device,
-                                          volatile float * __restrict__ y_pos_device,
-                                          const float theta,
-                                          const float epsilon,
-                                          const int num_nodes,
-                                          const int num_points,
-                                          const int maxdepth_bh_tree,
-                                          const int repulsive_force_threads
-                                          )
-{
-    register int i, j, k, n, depth, base, sbase, diff, pd, nd;
-    register float px, py, vx, vy, dx, dy, normsum, tmp, mult;
-    extern __shared__ int force_shared_memory[];
-    int* pos = (int*) force_shared_memory;
-    int* node = (int*) (pos + maxdepth_bh_tree*repulsive_force_threads/32);
-    float* dq = (float*) (node + maxdepth_bh_tree*repulsive_force_threads/32);
-
-    if (0 == threadIdx.x) {
-        dq[0] = (radiusd * radiusd) / (theta * theta); 
-        for (i = 1; i < maxdepthd; i++) {
-                dq[i] = dq[i - 1] * 0.25f; // radius is halved every level of tree so squared radius is quartered
-                dq[i - 1] += epsilon;
-        }
-        dq[i - 1] += epsilon;
-
-        if (maxdepthd > maxdepth_bh_tree) {
-            *errd = maxdepthd;
-        }
-    }
-    __syncthreads();
-
-    if (maxdepthd <= maxdepth_bh_tree) {
-        // figure out first thread in each warp (lane 0)
-        base = threadIdx.x / 32;
-        sbase = base * 32;
-        j = base * maxdepth_bh_tree;
-
-        diff = threadIdx.x - sbase;
-        // make multiple copies to avoid index calculations later
-        if (diff < maxdepth_bh_tree) {
-            dq[diff+j] = dq[diff];
-        }
-        __syncthreads();
-        __threadfence_block();
-
-        // iterate over all bodies assigned to thread
-        for (k = threadIdx.x + blockIdx.x * blockDim.x; k < num_points; k += blockDim.x * gridDim.x) {
-            i = cell_sorted[k];    // get permuted/sorted index
-            // cache position info
-            px = x_pos_device[i];
-            py = y_pos_device[i];
-
-            vx = 0.0f;
-            vy = 0.0f;
-            normsum = 0.0f;
-
-            // initialize iteration stack, i.e., push root node onto stack
-            depth = j;
-            if (sbase == threadIdx.x) {
-                pos[j] = 0;
-                node[j] = num_nodes * 4;
-            }
-
-            do {
-                // stack is not empty
-                pd = pos[depth];
-                nd = node[depth];
-                while (pd < 4) {
-                    // node on top of stack has more children to process
-                    n = children[nd + pd];    // load child pointer
-                    pd++;
-
-                    if (n >= 0) {
-                        dx = px - x_pos_device[n];
-                        dy = py - y_pos_device[n];
-                        tmp = dx*dx + dy*dy + epsilon; // distance squared plus small constant to prevent zeros
-                        #if (CUDART_VERSION >= 9000)
-                            if ((n < num_points) || __all_sync(__activemask(), tmp >= dq[depth])) {    // check if all threads agree that cell is far enough away (or is a body)
-                        #else
-                            if ((n < num_points) || __all(tmp >= dq[depth])) {
-                        #endif
-                                // from bhtsne - sptree.cpp
-                            tmp = 1 / (1 + tmp);
-                            mult = cell_mass[n] * tmp;
-                            normsum += mult;
-                            mult *= tmp;
-                            vx += dx * mult;
-                            vy += dy * mult;
-                        } else {
-                            // push cell onto stack
-                            if (sbase == threadIdx.x) {    // maybe don't push and inc if last child
-                                pos[depth] = pd;
-                                node[depth] = nd;
-                            }
-                            depth++;
-                            pd = 0;
-                            nd = n * 4;
-                        }
-                    } else {
-                        pd = 4;    // early out because all remaining children are also zero
-                    }
-                }
-                depth--;    // done with this level
-            } while (depth >= j);
-
-            if (stepd >= 0) {
-                // update velocity
-                x_vel_device[i] += vx;
-                y_vel_device[i] += vy;
-                normalization_vec_device[i] = normsum - 1.0f; // subtract one for self computation (qii)
-            }
-        }
-    }
-}
-
-void tsnecuda::bh::ComputeRepulsiveForces(tsnecuda::GpuOptions &gpu_opt,
-                                          thrust::device_vector<int> &errd,
-                                          thrust::device_vector<float> &repulsive_forces,
-                                          thrust::device_vector<float> &normalization_vec,
-                                          thrust::device_vector<int> &cell_sorted,
-                                          thrust::device_vector<int> &children,
-                                          thrust::device_vector<float> &cell_mass,
-                                          thrust::device_vector<float> &points,
-                                          const float theta,
-                                          const float epsilon,
-                                          const int num_nodes,
-                                          const int num_points,
-                                          const int num_blocks)
-{
-    tsnecuda::bh::ForceCalculationKernel<<<num_blocks * gpu_opt.repulsive_kernel_factor,
-                                           gpu_opt.repulsive_kernel_threads,
-                                           sizeof(int)*2*32*gpu_opt.repulsive_kernel_threads/32 +
-                                           sizeof(float)*32*gpu_opt.repulsive_kernel_threads/32>>>(
-                        thrust::raw_pointer_cast(errd.data()),
-                        thrust::raw_pointer_cast(repulsive_forces.data()),
-                        thrust::raw_pointer_cast(repulsive_forces.data() + num_nodes + 1),
-                        thrust::raw_pointer_cast(normalization_vec.data()),
-                        thrust::raw_pointer_cast(cell_sorted.data()),
-                        thrust::raw_pointer_cast(children.data()),
-                        thrust::raw_pointer_cast(cell_mass.data()),
-                        thrust::raw_pointer_cast(points.data()),
-                        thrust::raw_pointer_cast(points.data() + num_nodes + 1),
-                        theta, epsilon, num_nodes, num_points, 32, //TODO: Encode this variable somewhere
-                        gpu_opt.repulsive_kernel_threads
-                    );
-    GpuErrorCheck(cudaDeviceSynchronize());
-}
diff --git a/src/kernels/bounding_box.cu b/src/kernels/bounding_box.cu
deleted file mode 100644
index af0793e..0000000
--- a/src/kernels/bounding_box.cu
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
-        Computes bounding box for all points for barnes hut. Also resets start, child, mass
-*/
-
-#include "include/kernels/bounding_box.h"
-
-/******************************************************************************/
-/*** compute center and radius ************************************************/
-/******************************************************************************/
-
-__global__
-void tsnecuda::bh::BoundingBoxKernel(
-                       volatile int * __restrict__ cell_starts, 
-                       volatile int * __restrict__ children, 
-                       volatile float * __restrict__ cell_mass, 
-                       volatile float * __restrict__ x_pos_device, 
-                       volatile float * __restrict__ y_pos_device, 
-                       volatile float * __restrict__ x_max_device, 
-                       volatile float * __restrict__ y_max_device, 
-                       volatile float * __restrict__ x_min_device, 
-                       volatile float * __restrict__ y_min_device,
-                       const int num_nodes,
-                       const int num_points,
-                       const int bounding_box_threads) 
-{
-    register int i, j, k, inc;
-    register float val, minx, maxx, miny, maxy;
-    extern __shared__ float bounding_shared_memory[];
-    float* x_min_shared = bounding_shared_memory;
-    float* x_max_shared = x_min_shared + bounding_box_threads;
-    float* y_min_shared = x_max_shared + bounding_box_threads;
-    float* y_max_shared = y_min_shared + bounding_box_threads;
-   
-    // initialize with valid data (in case #bodies < #threads)
-    minx = maxx = x_pos_device[0];
-    miny = maxy = y_pos_device[0];
-
-    // scan all bodies
-    i = threadIdx.x;
-    inc = bounding_box_threads * gridDim.x;
-    for (j = i + blockIdx.x * bounding_box_threads; j < num_points; j += inc) {
-        val = x_pos_device[j];
-        minx = fminf(minx, val);
-        maxx = fmaxf(maxx, val);
-        val = y_pos_device[j];
-        miny = fminf(miny, val);
-        maxy = fmaxf(maxy, val);
-    }
-
-    // reduction in shared memory
-    x_min_shared[i] = minx;
-    x_max_shared[i] = maxx;
-    y_min_shared[i] = miny;
-    y_max_shared[i] = maxy;
-
-    for (j = bounding_box_threads / 2; j > 0; j /= 2) {
-        __syncthreads();
-        if (i < j) {
-            k = i + j;
-            x_min_shared[i] = minx = fminf(minx, x_min_shared[k]);
-            x_max_shared[i] = maxx = fmaxf(maxx, x_max_shared[k]);
-            y_min_shared[i] = miny = fminf(miny, y_min_shared[k]);
-            y_max_shared[i] = maxy = fmaxf(maxy, y_max_shared[k]);
-        }
-    }
-
-    // write block result to global memory
-    if (i == 0) {
-        k = blockIdx.x;
-        x_min_device[k] = minx;
-        x_max_device[k] = maxx;
-        y_min_device[k] = miny;
-        y_max_device[k] = maxy;
-        __threadfence();
-
-        inc = gridDim.x - 1;
-        if (inc == atomicInc(&blkcntd, inc)) {
-            // I'm the last block, so combine all block results
-            for (j = 0; j <= inc; j++) {
-                minx = fminf(minx, x_min_device[j]);
-                maxx = fmaxf(maxx, x_max_device[j]);
-                miny = fminf(miny, y_min_device[j]);
-                maxy = fmaxf(maxy, y_max_device[j]);
-            }
-
-            // compute 'radius'
-            radiusd = fmaxf(maxx - minx, maxy - miny) * 0.5f + 1e-5f;
-
-            // create root node
-            k = num_nodes;
-            bottomd = k;
-
-            cell_mass[k] = -1.0f;
-            cell_starts[k] = 0;
-            x_pos_device[k] = (minx + maxx) * 0.5f;
-            y_pos_device[k] = (miny + maxy) * 0.5f;
-            k *= 4;
-            for (i = 0; i < 4; i++) children[k + i] = -1;
-
-            stepd++;
-        }
-    }
-}
-
-void tsnecuda::bh::ComputeBoundingBox(tsnecuda::GpuOptions &gpu_opt,
-                                      thrust::device_vector<int> &cell_starts,
-                                      thrust::device_vector<int> &children,
-                                      thrust::device_vector<float> &cell_mass,
-                                      thrust::device_vector<float> &points,
-                                      thrust::device_vector<float> &x_max_device,
-                                      thrust::device_vector<float> &y_max_device,
-                                      thrust::device_vector<float> &x_min_device,
-                                      thrust::device_vector<float> &y_min_device,
-                                      const int num_nodes,
-                                      const int num_points,
-                                      const int num_blocks)
-{
-    tsnecuda::bh::BoundingBoxKernel<<<num_blocks * gpu_opt.bounding_kernel_factor,
-                                      gpu_opt.bounding_kernel_threads,
-                                      sizeof(float)*4*gpu_opt.bounding_kernel_threads>>>(
-                                                          thrust::raw_pointer_cast(cell_starts.data()),
-                                                          thrust::raw_pointer_cast(children.data()),
-                                                          thrust::raw_pointer_cast(cell_mass.data()),
-                                                          thrust::raw_pointer_cast(points.data()),
-                                                          thrust::raw_pointer_cast(points.data() + num_nodes + 1),
-                                                          thrust::raw_pointer_cast(x_max_device.data()),
-                                                          thrust::raw_pointer_cast(y_max_device.data()),
-                                                          thrust::raw_pointer_cast(x_min_device.data()),
-                                                          thrust::raw_pointer_cast(y_min_device.data()),
-                                                          num_nodes, num_points,
-                                                          gpu_opt.bounding_kernel_threads);
-    GpuErrorCheck(cudaDeviceSynchronize());
-}
diff --git a/src/kernels/initialization.cu b/src/kernels/initialization.cu
index 31205bc..b7d4259 100644
--- a/src/kernels/initialization.cu
+++ b/src/kernels/initialization.cu
@@ -1,18 +1,13 @@
-#include "include/kernels/initialization.h"
+#include "../include/kernels/initialization.h"
 
 /******************************************************************************/
 /*** initialize memory ********************************************************/
 /******************************************************************************/
 
 
-void tsnecuda::bh::Initialize(tsnecuda::GpuOptions &gpu_opt, thrust::device_vector<int> &errd)
+void tsnecuda::Initialize(tsnecuda::GpuOptions &gpu_opt, thrust::device_vector<int> &errd)
 {
     cudaFuncSetCacheConfig(tsnecuda::bh::IntegrationKernel, cudaFuncCachePreferL1);
     cudaFuncSetCacheConfig(tsnecuda::bh::ComputePijxQijKernel, cudaFuncCachePreferShared);
     GpuErrorCheck(cudaDeviceSynchronize());
 }
-
-void tsnecuda::naive::Initialize()
-{
-    // TODO: Add cache config sets for naive
-}
diff --git a/src/kernels/perplexity_search.cu b/src/kernels/perplexity_search.cu
index 413e98d..884979f 100644
--- a/src/kernels/perplexity_search.cu
+++ b/src/kernels/perplexity_search.cu
@@ -5,10 +5,10 @@
     Note that FAISS returns the first row as the same point, with distance = 0. pii is defined as zero.
 */
 
-#include "include/kernels/perplexity_search.h"
+#include "../include/kernels/perplexity_search.h"
 
 __global__
-void tsnecuda::bh::ComputePijKernel(
+void tsnecuda::ComputePijKernel(
                       volatile float * __restrict__ pij,
                       const float * __restrict__ squared_dist,
                       const float * __restrict__ betas,
@@ -29,31 +29,7 @@ void tsnecuda::bh::ComputePijKernel(
 
     // condition deals with evaluation of pii
     // FAISS neighbor zero is i so ignore it
-    pij[TID] = (j == 0 & dist == 0.0f) ? 0.0f : __expf(-beta * dist); 
-}
-
-__global__
-void tsnecuda::naive::ComputePijKernel(
-                        volatile float * __restrict__ pij,
-                        const float * __restrict__ squared_dist,
-                        const float * __restrict__ betas,
-                        const unsigned int num_points)
-{
-    register int TID, i, j;
-    register float dist, beta;
-
-    TID = threadIdx.x + blockIdx.x * blockDim.x;
-    if (TID >= num_points * num_points) return;
-
-    i = TID / num_points;
-    j = TID % num_points;
-    
-    beta = betas[i];
-    dist = squared_dist[TID];
-
-    // condition deals with evaluation of pii
-    // FAISS neighbor zero is i so ignore it
-    pij[TID] = (j == i & dist == 0.0f) ? 0.0f : __expf(-beta * dist); 
+    pij[TID] = (j == 0 & dist == 0.0f) ? 0.0f : __expf(-beta * dist);
 }
 
 __global__
@@ -98,7 +74,7 @@ void tsnecuda::PerplexitySearchKernel(
     found[i] = is_found;
 }
 
-void tsnecuda::bh::SearchPerplexity(tsnecuda::GpuOptions &gpu_opt,
+void tsnecuda::SearchPerplexity(tsnecuda::GpuOptions &gpu_opt,
                                      cublasHandle_t &handle,
                                      thrust::device_vector<float> &pij,
                                      thrust::device_vector<float> &squared_dist,
@@ -127,13 +103,13 @@ void tsnecuda::bh::SearchPerplexity(tsnecuda::GpuOptions &gpu_opt,
     thrust::device_vector<float> row_sum, neg_entropy;
     do {
         // compute Gaussian Kernel row
-        tsnecuda::bh::ComputePijKernel<<<NBLOCKS1, BLOCKSIZE1>>>(
+        tsnecuda::ComputePijKernel<<<NBLOCKS1, BLOCKSIZE1>>>(
                         thrust::raw_pointer_cast(pij.data()),
                         thrust::raw_pointer_cast(squared_dist.data()),
                         thrust::raw_pointer_cast(betas.data()),
                         num_points, num_near_neighbors);
         GpuErrorCheck(cudaDeviceSynchronize());
-        
+
         // compute entropy of current row
         row_sum = tsnecuda::util::ReduceSum(handle, pij, num_near_neighbors, num_points, 0);
         thrust::transform(pij.begin(), pij.end(), entropy.begin(), tsnecuda::util::FunctionalEntropy());
@@ -158,63 +134,3 @@ void tsnecuda::bh::SearchPerplexity(tsnecuda::GpuOptions &gpu_opt,
 
     tsnecuda::util::BroadcastMatrixVector(pij, row_sum, num_near_neighbors, num_points, thrust::divides<float>(), 1, 1.0f);
 }
-
-void tsnecuda::naive::SearchPerplexity(
-                                     cublasHandle_t &handle,
-                                     thrust::device_vector<float> &pij,
-                                     thrust::device_vector<float> &squared_dist,
-                                     const float perplexity_target,
-                                     const float epsilon,
-                                     const int num_points)
-{
-    // use beta instead of sigma (this matches the bhtsne code but not the paper)
-    // beta is just multiplicative instead of divisive (changes the way binary search works)
-    thrust::device_vector<float> betas(num_points, 1.0f);
-    thrust::device_vector<float> lower_bound_beta(num_points, -FLT_MAX);
-    thrust::device_vector<float> upper_bound_beta(num_points, FLT_MAX);
-    thrust::device_vector<float> entropy(num_points * num_points);
-    thrust::device_vector<int> found(num_points);
-
-    // TODO: this doesn't really fit with the style
-    const int BLOCKSIZE1 = 1024;
-    const int NBLOCKS1 = iDivUp(num_points * num_points, BLOCKSIZE1);
-
-    const int BLOCKSIZE2 = 128;
-    const int NBLOCKS2 = iDivUp(num_points, BLOCKSIZE2);
-
-    size_t iters = 0;
-    int all_found = 0;
-    thrust::device_vector<float> row_sum, neg_entropy;
-    do {
-        // compute Gaussian Kernel row
-        tsnecuda::naive::ComputePijKernel<<<NBLOCKS1, BLOCKSIZE1>>>(
-                        thrust::raw_pointer_cast(pij.data()),
-                        thrust::raw_pointer_cast(squared_dist.data()),
-                        thrust::raw_pointer_cast(betas.data()),
-                        num_points);
-        GpuErrorCheck(cudaDeviceSynchronize());
-        
-        // compute entropy of current row
-        row_sum = tsnecuda::util::ReduceSum(handle, pij, num_points, num_points, 0);
-        thrust::transform(pij.begin(), pij.end(), entropy.begin(), tsnecuda::util::FunctionalEntropy());
-        neg_entropy = tsnecuda::util::ReduceAlpha(handle, entropy, num_points, num_points, -1.0f, 0);
-
-        // binary search for beta
-        tsnecuda::PerplexitySearchKernel<<<NBLOCKS2, BLOCKSIZE2>>>(
-                                                            thrust::raw_pointer_cast(betas.data()),
-                                                            thrust::raw_pointer_cast(lower_bound_beta.data()),
-                                                            thrust::raw_pointer_cast(upper_bound_beta.data()),
-                                                            thrust::raw_pointer_cast(found.data()),
-                                                            thrust::raw_pointer_cast(neg_entropy.data()),
-                                                            thrust::raw_pointer_cast(row_sum.data()),
-                                                            perplexity_target, epsilon, num_points);
-        GpuErrorCheck(cudaDeviceSynchronize());
-
-        // Check if searching is done
-        all_found = thrust::reduce(found.begin(), found.end(), 1, thrust::minimum<int>());
-        iters++;
-    } while (!all_found && iters < 200);
-    // TODO: Warn if iters == 200 because perplexity not found?
-
-    tsnecuda::util::BroadcastMatrixVector(pij, row_sum, num_points, num_points, thrust::divides<float>(), 1, 1.0f);
-}
diff --git a/src/kernels/tree_builder.cu b/src/kernels/tree_builder.cu
deleted file mode 100644
index 3e3c8df..0000000
--- a/src/kernels/tree_builder.cu
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
-        Builds a quad tree for computing Barnes-Hut approximation of t-SNE repulsive forces.
-*/
-
-#include "include/kernels/tree_builder.h"
-
-/******************************************************************************/
-/*** build tree ***************************************************************/
-/******************************************************************************/
-
-__global__
-void tsnecuda::bh::ClearKernel1(volatile int * __restrict__ children, const int num_nodes, const int num_points)
-{
-    register int k, inc, top, bottom;
-
-    top = 4 * num_nodes;
-    bottom = 4 * num_points;
-    inc = blockDim.x * gridDim.x;
-    k = (bottom & (-32)) + threadIdx.x + blockIdx.x * blockDim.x;  // align to warp size
-    if (k < bottom) k += inc;
-
-    // iterate over all cells assigned to thread
-    while (k < top) {
-        children[k] = -1;
-        k += inc;
-    }
-}
-
-
-__global__
-void tsnecuda::bh::TreeBuildingKernel(volatile int * __restrict__ errd, 
-                                        volatile int * __restrict__ children, 
-                                        volatile float * __restrict__ x_pos_device, 
-                                        volatile float * __restrict__ y_pos_device,
-                                        const int num_nodes,
-                                        const int num_points) 
-{
-    register int i, j, depth, localmaxdepth, skip, inc;
-    register float x, y, r;
-    register float px, py;
-    register float dx, dy;
-    register int ch, n, cell, locked, patch;
-    register float radius, rootx, rooty;
-
-    // cache root data
-    radius = radiusd;
-    rootx = x_pos_device[num_nodes];
-    rooty = y_pos_device[num_nodes];
-
-    localmaxdepth = 1;
-    skip = 1;
-    inc = blockDim.x * gridDim.x;
-    i = threadIdx.x + blockIdx.x * blockDim.x;
-
-    // iterate over all bodies assigned to thread
-    while (i < num_points) {
-        if (skip != 0) {
-          // new body, so start traversing at root
-          skip = 0;
-          px = x_pos_device[i];
-          py = y_pos_device[i];
-          n = num_nodes;
-          depth = 1;
-          r = radius * 0.5f;
-          dx = dy = -r;
-          j = 0;
-          // determine which child to follow
-          if (rootx < px) {j = 1; dx = r;}
-          if (rooty < py) {j |= 2; dy = r;}
-          x = rootx + dx;
-          y = rooty + dy;
-        }
-
-        // follow path to leaf cell
-        ch = children[n*4+j];
-        while (ch >= num_points) {
-          n = ch;
-          depth++;
-          r *= 0.5f;
-          dx = dy = -r;
-          j = 0;
-          // determine which child to follow
-          if (x < px) {j = 1; dx = r;}
-          if (y < py) {j |= 2; dy = r;}
-          x += dx;
-          y += dy;
-          ch = children[n*4+j];
-        }
-        if (ch != -2) {  // skip if child pointer is locked and try again later
-          locked = n*4+j;
-          if (ch == -1) {
-            if (-1 == atomicCAS((int *)&children[locked], -1, i)) {  // if null, just insert the new body
-              localmaxdepth = max(depth, localmaxdepth);
-              i += inc;  // move on to next body
-              skip = 1;
-            }
-          } else {  // there already is a body in this position
-            if (ch == atomicCAS((int *)&children[locked], ch, -2)) {  // try to lock
-              patch = -1;
-              // create new cell(s) and insert the old and new body
-              do {
-                depth++;
-
-                cell = atomicSub((int *)&bottomd, 1) - 1;
-                if (cell <= num_points) {
-                  *errd = 1;
-                  bottomd = num_nodes;
-                }
-
-                if (patch != -1) {
-                  children[n*4+j] = cell;
-                }
-                patch = max(patch, cell);
-                j = 0;
-                if (x < x_pos_device[ch]) j = 1;
-                if (y < y_pos_device[ch]) j |= 2;
-                children[cell*4+j] = ch;
-                n = cell;
-                r *= 0.5f;
-                dx = dy = -r;
-                j = 0;
-                if (x < px) {j = 1; dx = r;}
-                if (y < py) {j |= 2; dy = r;}
-                x += dx;
-                y += dy;
-                ch = children[n*4+j];
-                // repeat until the two bodies are different children
-              } while (ch >= 0 && r > 1e-10); // add radius check because bodies that are very close together can cause this to fail... if points are too close together it will exceed the max depth of the tree 
-              children[n*4+j] = i;
-
-              localmaxdepth = max(depth, localmaxdepth);
-              i += inc;  // move on to next body
-              skip = 2;
-            }
-          }
-        }
-        __threadfence();
-
-        if (skip == 2) {
-          children[locked] = patch;
-        }
-    }
-    // record maximum tree depth
-    atomicMax((int *)&maxdepthd, localmaxdepth);
-}
-
-
-__global__
-void tsnecuda::bh::ClearKernel2(volatile int * __restrict__ cell_starts, volatile float * __restrict__ cell_mass, const int num_nodes)
-{
-    register int k, inc, bottom;
-
-    bottom = bottomd;
-    inc = blockDim.x * gridDim.x;
-    k = (bottom & (-32)) + threadIdx.x + blockIdx.x * blockDim.x;  // align to warp size
-    if (k < bottom) k += inc;
-
-    // iterate over all cells assigned to thread
-    while (k < num_nodes) {
-        cell_mass[k] = -1.0f;
-        cell_starts[k] = -1;
-        k += inc;
-    }
-}
-
-void tsnecuda::bh::BuildTree(tsnecuda::GpuOptions &gpu_opt,
-                               thrust::device_vector<int> &errd,
-                               thrust::device_vector<int> &children,
-                               thrust::device_vector<int> &cell_starts,
-                               thrust::device_vector<float> &cell_mass,
-                               thrust::device_vector<float> &points,
-                               const int num_nodes,
-                               const int num_points,
-                               const int num_blocks)
-{
-    tsnecuda::bh::ClearKernel1<<<num_blocks, 1024>>>(thrust::raw_pointer_cast(children.data()),
-                                                       num_nodes, num_points);
-    tsnecuda::bh::TreeBuildingKernel<<<num_blocks * gpu_opt.tree_kernel_factor,
-                                       gpu_opt.tree_kernel_threads>>>(
-                                                                  thrust::raw_pointer_cast(errd.data()),
-                                                                  thrust::raw_pointer_cast(children.data()),
-                                                                  thrust::raw_pointer_cast(points.data()),
-                                                                  thrust::raw_pointer_cast(points.data() + num_nodes + 1),
-                                                                  num_nodes, num_points);
-    tsnecuda::bh::ClearKernel2<<<num_blocks, 1024>>>(thrust::raw_pointer_cast(cell_starts.data()),
-                                                       thrust::raw_pointer_cast(cell_mass.data()),
-                                                       num_nodes);
-    GpuErrorCheck(cudaDeviceSynchronize());
-}
diff --git a/src/kernels/tree_sort.cu b/src/kernels/tree_sort.cu
deleted file mode 100644
index 67235c8..0000000
--- a/src/kernels/tree_sort.cu
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
-        Sort points and cells by morton code.
-*/
-
-#include "include/kernels/tree_sort.h"
-
-/******************************************************************************/
-/*** sort bodies **************************************************************/
-/******************************************************************************/
-
-__global__
-void tsnecuda::bh::SortKernel(int * __restrict__ cell_sorted, 
-                              volatile int * __restrict__ cell_starts, 
-                              int * __restrict__ children,
-                              const int * __restrict__ cell_counts, 
-                              const int num_nodes,
-                              const int num_points)
-{
-    register int i, j, k, ch, dec, start, bottom;
-
-    bottom = bottomd;
-    dec = blockDim.x * gridDim.x;
-    k = num_nodes + 1 - dec + threadIdx.x + blockIdx.x * blockDim.x;
-
-    // iterate over all cells assigned to thread
-    while (k >= bottom) {
-        start = cell_starts[k];
-        if (start >= 0) {
-            j = 0;
-            for (i = 0; i < 4; i++) {
-                ch = children[k*4+i];
-                if (ch >= 0) {
-                    if (i != j) {
-                        // move children to front (needed later for speed)
-                        children[k*4+i] = -1;
-                        children[k*4+j] = ch;
-                    }
-                    j++;
-                    if (ch >= num_points) {
-                        // child is a cell
-                        cell_starts[ch] = start;    // set start ID of child
-                        start += cell_counts[ch];    // add #bodies in subtree
-                    } else {
-                        // child is a body
-                        cell_sorted[start] = ch;    // record body in 'sorted' array
-                        start++;
-                    }
-                }
-            }
-            k -= dec;    // move on to next cell
-        }
-    }
-}
-
-void tsnecuda::bh::SortCells(tsnecuda::GpuOptions &gpu_opt,
-                             thrust::device_vector<int> &cell_sorted,
-                             thrust::device_vector<int> &cell_starts,
-                             thrust::device_vector<int> &children,
-                             thrust::device_vector<int> &cell_counts,
-                             const int num_nodes,
-                             const int num_points,
-                             const int num_blocks)
-{
-    tsnecuda::bh::SortKernel<<<num_blocks * gpu_opt.sort_kernel_factor,
-                               gpu_opt.sort_kernel_threads>>>(
-                                thrust::raw_pointer_cast(cell_sorted.data()),
-                                thrust::raw_pointer_cast(cell_starts.data()),
-                                thrust::raw_pointer_cast(children.data()),
-                                thrust::raw_pointer_cast(cell_counts.data()),
-                                num_nodes, num_points);
-    GpuErrorCheck(cudaDeviceSynchronize());
-}
diff --git a/src/kernels/tree_summary.cu b/src/kernels/tree_summary.cu
deleted file mode 100644
index 0f646d1..0000000
--- a/src/kernels/tree_summary.cu
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
-        Summarize position and mass of cells in quad-tree.
-*/
-
-#include "include/kernels/tree_summary.h"
-
-/******************************************************************************/
-/*** compute center of mass ***************************************************/
-/******************************************************************************/
-
-__global__
-void tsnecuda::bh::SummarizationKernel(
-                               volatile int * __restrict cell_counts, 
-                               volatile float * __restrict cell_mass, 
-                               volatile float * __restrict x_pos_device, 
-                               volatile float * __restrict y_pos_device,
-                               const int * __restrict children,
-                               const int num_nodes,
-                               const int num_points,
-                               const int gpu_summary_threads) 
-{
-    register int i, j, k, ch, inc, cnt, bottom, flag;
-    register float m, cm, px, py;
-    extern __shared__ int summary_shared_memory[];
-    int* child = (int*) summary_shared_memory;
-    float* mass = (float*) (child + 4*gpu_summary_threads);
-
-    bottom = bottomd;
-    inc = blockDim.x * gridDim.x;
-    k = (bottom & (-32)) + threadIdx.x + blockIdx.x * blockDim.x;    // align to warp size
-    if (k < bottom) k += inc;
-
-    register int restart = k;
-    for (j = 0; j < 5; j++) {    // wait-free pre-passes
-        // iterate over all cells assigned to thread
-        while (k <= num_nodes) {
-            if (cell_mass[k] < 0.0f) {
-                for (i = 0; i < 4; i++) {
-                    ch = children[k*4+i];
-                    child[i*gpu_summary_threads+threadIdx.x] = ch;    // cache children
-                    if ((ch >= num_points) && ((mass[i*gpu_summary_threads+threadIdx.x] = cell_mass[ch]) < 0.0f)) {
-                        break;
-                    }
-                }
-                if (i == 4) {
-                    // all children are ready
-                    cm = 0.0f;
-                    px = 0.0f;
-                    py = 0.0f;
-                    cnt = 0;
-                    for (i = 0; i < 4; i++) {
-                        ch = child[i*gpu_summary_threads+threadIdx.x];
-                        if (ch >= 0) {
-                            if (ch >= num_points) {    // count bodies (needed later)
-                                m = mass[i*gpu_summary_threads+threadIdx.x];
-                                cnt += cell_counts[ch];
-                            } else {
-                                m = cell_mass[ch];
-                                cnt++;
-                            }
-                            // add child's contribution
-                            cm += m;
-                            px += x_pos_device[ch] * m;
-                            py += y_pos_device[ch] * m;
-                        }
-                    }
-                    cell_counts[k] = cnt;
-                    m = 1.0f / cm;
-                    x_pos_device[k] = px * m;
-                    y_pos_device[k] = py * m;
-                    __threadfence();    // make sure data are visible before setting mass
-                    cell_mass[k] = cm;
-                }
-            }
-            k += inc;    // move on to next cell
-        }
-        k = restart;
-    }
-
-    flag = 0;
-    j = 0;
-    // iterate over all cells assigned to thread
-    while (k <= num_nodes) {
-        if (cell_mass[k] >= 0.0f) {
-            k += inc;
-        } else {
-            if (j == 0) {
-                j = 4;
-                for (i = 0; i < 4; i++) {
-                    ch = children[k*4+i];
-                    child[i*gpu_summary_threads+threadIdx.x] = ch;    // cache children
-                    if ((ch < num_points) || ((mass[i*gpu_summary_threads+threadIdx.x] = cell_mass[ch]) >= 0.0f)) {
-                        j--;
-                    }
-                }
-            } else {
-                j = 4;
-                for (i = 0; i < 4; i++) {
-                    ch = child[i*gpu_summary_threads+threadIdx.x];
-                    if ((ch < num_points) || (mass[i*gpu_summary_threads+threadIdx.x] >= 0.0f) || ((mass[i*gpu_summary_threads+threadIdx.x] = cell_mass[ch]) >= 0.0f)) {
-                        j--;
-                    }
-                }
-            }
-
-            if (j == 0) {
-                // all children are ready
-                cm = 0.0f;
-                px = 0.0f;
-                py = 0.0f;
-                cnt = 0;
-                for (i = 0; i < 4; i++) {
-                    ch = child[i*gpu_summary_threads+threadIdx.x];
-                    if (ch >= 0) {
-                        if (ch >= num_points) {    // count bodies (needed later)
-                            m = mass[i*gpu_summary_threads+threadIdx.x];
-                            cnt += cell_counts[ch];
-                        } else {
-                            m = cell_mass[ch];
-                            cnt++;
-                        }
-                        // add child's contribution
-                        cm += m;
-                        px += x_pos_device[ch] * m;
-                        py += y_pos_device[ch] * m;
-                    }
-                }
-                cell_counts[k] = cnt;
-                m = 1.0f / cm;
-                x_pos_device[k] = px * m;
-                y_pos_device[k] = py * m;
-                flag = 1;
-            }
-        }
-        __syncthreads();    
-        __threadfence();
-        if (flag != 0) {
-            cell_mass[k] = cm;
-            k += inc;
-            flag = 0;
-        }
-    }
-}
-
-void tsnecuda::bh::SummarizeTree(tsnecuda::GpuOptions &gpu_opt,
-                                 thrust::device_vector<int> &cell_counts,
-                                 thrust::device_vector<int> &children,
-                                 thrust::device_vector<float> &cell_mass,
-                                 thrust::device_vector<float> &pts_device,
-                                 const int num_nodes,
-                                 const int num_points,
-                                 const int num_blocks)
-{
-    tsnecuda::bh::SummarizationKernel<<<num_blocks * gpu_opt.summary_kernel_factor,
-                                        gpu_opt.summary_kernel_threads,
-                                        sizeof(int)*4*gpu_opt.summary_kernel_threads + 
-                                        sizeof(float)*4*gpu_opt.summary_kernel_threads>>>(
-                                                    thrust::raw_pointer_cast(cell_counts.data()),
-                                                    thrust::raw_pointer_cast(cell_mass.data()),
-                                                    thrust::raw_pointer_cast(pts_device.data()),
-                                                    thrust::raw_pointer_cast(pts_device.data() + num_nodes + 1),
-                                                    thrust::raw_pointer_cast(children.data()),
-                                                    num_nodes, num_points,
-                                                    gpu_opt.summary_kernel_threads);
-    GpuErrorCheck(cudaDeviceSynchronize());
-}
diff --git a/src/naive_tsne.cu b/src/naive_tsne.cu
deleted file mode 100644
index 9785b6d..0000000
--- a/src/naive_tsne.cu
+++ /dev/null
@@ -1,416 +0,0 @@
-/**
- * @brief Implementation of naive T-SNE
- * 
- * @file naive_tsne.cu
- * @author David Chan
- * @date 2018-04-04
- */
-
-#include "naive_tsne.h"
-
-__global__ void upper_lower_assign(float * __restrict__ sigmas,
-                                    float * __restrict__ lower_bound,
-                                    float * __restrict__ upper_bound,
-                                    const float * __restrict__ perplexity,
-                                    const float target_perplexity,
-                                    const unsigned int N)
-{
-    int TID = threadIdx.x + blockIdx.x * blockDim.x;
-    if (TID > N) return;
-
-    if (perplexity[TID] > target_perplexity)
-        upper_bound[TID] = sigmas[TID];
-    else
-        lower_bound[TID] = sigmas[TID];
-    sigmas[TID] = (upper_bound[TID] + lower_bound[TID])/2.0f;
-}
-
-// TODO: replace this with the bhtsne version
-void NaiveTSNE::thrust_search_perplexity(cublasHandle_t &handle,
-                        thrust::device_vector<float> &sigmas,
-                        thrust::device_vector<float> &lower_bound,
-                        thrust::device_vector<float> &upper_bound,
-                        thrust::device_vector<float> &perplexity,
-                        const thrust::device_vector<float> &pij,
-                        const float target_perplexity,
-                        const unsigned int N)
-{
-    // std::cout << "pij:" << std::endl;
-    // printarray(pij, N, N);
-    // std::cout << std::endl;
-    thrust::device_vector<float> entropy_(pij.size());
-    thrust::transform(pij.begin(), pij.end(), entropy_.begin(), tsnecuda::util::FunctionalEntropy());
-    tsnecuda::util::ZeroDeviceMatrixDiagonal(entropy_, N);
-
-    // std::cout << "entropy:" << std::endl;
-    // printarray(entropy_, N, N);
-    // std::cout << std::endl;
-
-    auto neg_entropy = tsnecuda::util::ReduceAlpha(handle, entropy_, N, N, -1.0f, 1);
-
-    // std::cout << "neg_entropy:" << std::endl;
-    // printarray(neg_entropy, 1, N);
-    // std::cout << std::endl;
-    thrust::transform(neg_entropy.begin(), neg_entropy.end(), perplexity.begin(), tsnecuda::util::FunctionalPower2());
-    // std::cout << "perplexity:" << std::endl;
-    // printarray(perplexity, 1, N);
-    // std::cout << std::endl;
-    
-    const unsigned int BLOCKSIZE = 32;
-    const unsigned int NBLOCKS = iDivUp(N, BLOCKSIZE);
-    upper_lower_assign<<<NBLOCKS,BLOCKSIZE>>>(thrust::raw_pointer_cast(sigmas.data()),
-                                                thrust::raw_pointer_cast(lower_bound.data()),
-                                                thrust::raw_pointer_cast(upper_bound.data()),
-                                                thrust::raw_pointer_cast(perplexity.data()),
-                                                target_perplexity,
-                                                N);
-    // std::cout << "sigmas" << std::endl;
-    // printarray(sigmas, 1, N);
-    // std::cout << std::endl;
-
-}
-
-thrust::device_vector<float> NaiveTSNE::search_perplexity(cublasHandle_t &handle,
-                        thrust::device_vector<float> &points,
-                        const float perplexity_target,
-                        const float eps,
-                        const unsigned int N,
-                        const unsigned int NDIMS) {
-
-    thrust::device_vector<float> sigmas(N, 500.0f);
-    thrust::device_vector<float> best_sigmas(N);
-    thrust::device_vector<float> perplexity(N);
-    thrust::device_vector<float> lbs(N, 0.0f);
-    thrust::device_vector<float> ubs(N, 1000.0f);
-
-    thrust::device_vector<float> pij(N*N);
-    NaiveTSNE::compute_pij(handle, pij, points, sigmas, N, NDIMS);
-    float best_perplexity = 1000.0f;
-    float perplexity_diff = 50.0f;
-    int iters = 0;
-    while (perplexity_diff > eps) {
-         NaiveTSNE::thrust_search_perplexity(handle, sigmas, lbs, ubs, perplexity, pij, perplexity_target, N);
-         perplexity_diff = abs(thrust::reduce(perplexity.begin(), perplexity.end())/((float) N) - perplexity_target);
-         if (perplexity_diff < best_perplexity){
-             best_perplexity = perplexity_diff;
-            //  printf("!! Best perplexity found in %d iterations: %0.5f\n", iters, perplexity_diff);
-             thrust::copy(sigmas.begin(), sigmas.end(), best_sigmas.begin());
-         }
-         NaiveTSNE::compute_pij(handle, pij, points, sigmas, N, NDIMS);
-         iters++;
-    } // Close perplexity search
-
-    return best_sigmas;
-}
-
-// TODO: put this in the same file as bhtsne compute_pij
-void NaiveTSNE::compute_pij(
-                        cublasHandle_t &handle, 
-                        thrust::device_vector<float> &pij,
-                        const thrust::device_vector<float> &points, 
-                        const thrust::device_vector<float> &sigma,
-                        const unsigned int N, 
-                        const unsigned int NDIMS) 
-{
-    tsnecuda::util::SquaredPairwiseDistance(handle, pij, points, N, NDIMS);
-
-    // std::cout << "Sigma:" << std::endl;
-    // printarray(sigma, N, 1);
-    thrust::device_vector<float> sigma_squared(sigma.size());
-    tsnecuda::util::SquareDeviceVector(sigma_squared, sigma);
-    
-    // divide column by sigmas (matrix[i,:] gets divided by sigma_i^2)
-    tsnecuda::util::BroadcastMatrixVector(pij, sigma_squared, N, N, thrust::divides<float>(), 0, -2.0f);
-    thrust::transform(pij.begin(), pij.end(), pij.begin(), tsnecuda::util::FunctionalExp());
-    tsnecuda::util::ZeroDeviceMatrixDiagonal(pij, N);
-    
-    // tsnecuda::util::ReduceSum over cols? rows? Fuck if I know. 
-    auto sums = tsnecuda::util::ReduceSum(handle, pij, N, N, 1);
-
-    // divide column by resulting vector
-    tsnecuda::util::BroadcastMatrixVector(pij, sums, N, N, thrust::divides<float>(), 0, 1.0f);
-}
-
-void NaiveTSNE::symmetrize_pij(cublasHandle_t &handle, 
-                                thrust::device_vector<float> &pij, 
-                                const unsigned int N)
-{
-    float alpha = 0.5f;
-    float beta = 0.5f;
-    thrust::device_vector<float> pij_out(N*N);
-    CublasSafeCall(cublasSgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, N, N, &alpha, thrust::raw_pointer_cast(pij.data()), N, 
-                               &beta, thrust::raw_pointer_cast(pij.data()), N, thrust::raw_pointer_cast(pij_out.data()), N));
-    pij = pij_out;
-}
-
-
-/**
-  * Gradient formula from http://www.jmlr.org/papers/volume9/vandermaaten08a/vandermaaten08a.pdf
-  * 
-  * Given by ->
-  *     forces_i = 4 * \sum_j (pij - qij)(yi - yj)(1 + ||y_i - y_j||^2)^-1
-  * 
-  * Notation below - in comments, actual variables in the code are referred to by <varname>_ to differentiate from the mathematical quantities
-  *                     It's hard to name variables correctly because we don't want to keep allocating more memory. There's probably a better solution than this though.
-  */
-float NaiveTSNE::compute_gradients(cublasHandle_t &handle, 
-                        thrust::device_vector<float> &forces,
-                        thrust::device_vector<float> &dist, 
-                        thrust::device_vector<float> &ys, 
-                        thrust::device_vector<float> &pij, 
-                        thrust::device_vector<float> &qij, 
-                        const unsigned int N,
-                        const unsigned int PROJDIM,
-                        float eta) 
-{
-    // dist_ = ||y_i - y_j||^2
-    tsnecuda::util::SquaredPairwiseDistance(handle, dist, ys, N, PROJDIM);
-
-    // std::cout << std::endl << std::endl << "Dist" << std::endl;
-    // printarray(dist, N, N);
-
-    // dist_ = (1 + ||y_i - y_j||^2)^-1
-    thrust::transform(dist.begin(), dist.end(), dist.begin(), tsnecuda::util::FunctionalIncrementInverse());
-    tsnecuda::util::ZeroDeviceMatrixDiagonal(dist, N);
-
-    // std::cout << std::endl << std::endl << "Inc-Inv Dist" << std::endl;
-    // printarray(dist, N, N);
-
-    // qij_ = (1 + ||y_i - y_j||^2)^-1 / \Sum_{k != i} (1 + ||y_i - y_k||^2)^-1
-    float sum = thrust::reduce(dist.begin(), dist.end(), 0.0f, thrust::plus<float>());
-    thrust::transform(dist.begin(), dist.end(), thrust::make_constant_iterator<float>(sum), qij.begin(), thrust::divides<float>());
-
-    // auto sums = tsnecuda::util::ReduceSum(handle, qij, N, N, 1);
-
-    // std::cout << std::endl << std::endl << "Sum-Dist" << std::endl;
-    // printarray(sums, 1, N);
-
-    // tsnecuda::util::BroadcastMatrixVector(qij, sums, N, N, thrust::divides<float>(), 0, 1.0f);
-
-    // std::cout << std::endl << std::endl << "Qij" << std::endl;
-    // printarray(qij, N, N);
-
-    // Compute loss = \sum_ij pij * log(pij / qij)
-    thrust::device_vector<float> loss_(N * N);
-    thrust::transform(pij.begin(), pij.end(), qij.begin(), loss_.begin(), tsnecuda::util::FunctionalKlDivergence());
-    tsnecuda::util::ZeroDeviceMatrixDiagonal(loss_, N);
-
-    // printarray(loss_, N, N);
-    float loss = thrust::reduce(loss_.begin(), loss_.end(), 0.0f, thrust::plus<float>());
-
-    // qij_ = pij - qij
-    thrust::transform(pij.begin(), pij.end(), qij.begin(), qij.begin(), thrust::minus<float>());
-
-    // std::cout << std::endl << std::endl << "Pij-Qij" << std::endl;
-    // printarray(qij, N, N);
-
-    // qij_ = (pij - qij)(1 + ||y_i - y_j||^2)^-1
-    thrust::transform(qij.begin(), qij.end(), dist.begin(), qij.begin(), thrust::multiplies<float>());
-
-    // std::cout << std::endl << std::endl << "A" << std::endl;
-    // printarray(qij, N, N);
-
-    // forces_ = \sum_j (pij - qij)(1 + ||y_i - y_j||^2)^-1
-    float alpha = 1.0f;
-    float beta = 0.0f;
-    thrust::device_vector<float> ones(PROJDIM * N, 1.0f);
-    CublasSafeCall(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, PROJDIM, N, &alpha, 
-                                thrust::raw_pointer_cast(qij.data()), N, thrust::raw_pointer_cast(ones.data()), N, &beta, 
-                                thrust::raw_pointer_cast(forces.data()), N));
-
-    // std::cout << std::endl << std::endl << "A * 1" << std::endl;
-    // printarray(forces, 2, N);
-
-    // forces_ = y_i * \sum_j (pij - qij)(1 + ||y_i - y_j||^2)^-1
-    thrust::transform(forces.begin(), forces.end(), ys.begin(), forces.begin(), thrust::multiplies<float>());
-    alpha = eta;
-    beta = -eta;
-    // forces_ = 4 * y_i * \sum_j (pij - qij)(1 + ||y_i - y_j||^2)^-1 - 4 * \sum_j y_j(pij - qij)(1 + ||y_i - y_j||^2)^-1
-    CublasSafeCall(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, PROJDIM, N, &alpha, 
-                                thrust::raw_pointer_cast(qij.data()), N, thrust::raw_pointer_cast(ys.data()), N, &beta, 
-                                thrust::raw_pointer_cast(forces.data()), N));
-
-    // std::cout << std::endl << std::endl << "Final Forces" << std::endl;
-    // printarray(forces, 2, N);
-
-    return loss;
-}
-
-thrust::device_vector<float> NaiveTSNE::tsne(cublasHandle_t &handle, 
-                                        thrust::device_vector<float> &points, 
-                                        const unsigned int N, 
-                                        const unsigned int NDIMS,
-                                        const unsigned int PROJDIM) {
-    tsnecuda::util::MaxNormalizeDeviceVector(points);
-    std::default_random_engine generator(time(NULL));
-
-    // Choose the right sigmas
-    std::cout << "Selecting sigmas to match perplexity..." << std::endl;
-
-    //TODO: Fix perplexity search
-
-    float perplexity_target = 8.0f;
-    float eps = 1e-2;
-
-    thrust::device_vector<float> sigmas = NaiveTSNE::search_perplexity(handle, points, perplexity_target, eps, N, NDIMS);
-    thrust::device_vector<float> pij(N*N);
-    NaiveTSNE::compute_pij(handle, pij, points, sigmas, N, NDIMS);
-    NaiveTSNE::symmetrize_pij(handle, pij, N);
-
-    //std::cout << "Pij" << std::endl;
-    //printarray(pij, N, N);
-    
-    thrust::device_vector<float> forces(N * PROJDIM);
-    thrust::device_vector<float> ys = tsnecuda::util::RandomDeviceUniformZeroOneVector(generator, N * PROJDIM);
-    
-    // Momentum variables
-    thrust::device_vector<float> yt_1(N * PROJDIM);
-    thrust::device_vector<float> momentum(N * PROJDIM);
-    float momentum_weight = 0.8f;
-
-
-    //printarray(ys, N, 2);
-    thrust::device_vector<float> qij(N * N);
-    thrust::device_vector<float> dist(N * N);
-    float eta = 1.00f;
-    float loss = 0.0f;
-
-    
-    #ifdef DEBUG
-        // Dump the original points
-        std::ofstream dump_points_file;
-        dump_points_file.open ("dump_points.txt");
-        dump_points_file << N << " " << NDIMS << std::endl;
-        for (int i = 0; i < N; i++) {
-            for (int j = 0; j < NDIMS; j++) {
-                dump_points_file << points[i + j*N] << " ";
-            }
-            dump_points_file << std::endl;
-        }
-        dump_points_file.close();
-    #endif
-
-    
-    #ifdef DEBUG
-        // Create a dump file for the points
-        std::ofstream dump_file;
-        dump_file.open("dump_ys.txt");
-        float host_ys[N * PROJDIM];
-        dump_file << N << " " << PROJDIM << std::endl;
-    #endif
-
-    for (int i = 0; i < 1000; i++) {
-        loss = NaiveTSNE::compute_gradients(handle, forces, dist, ys, pij, qij, N, PROJDIM, eta);
-        
-        // Compute the momentum
-        thrust::transform(ys.begin(), ys.end(), yt_1.begin(), momentum.begin(), thrust::minus<float>());
-        thrust::transform(momentum.begin(), momentum.end(), thrust::make_constant_iterator(momentum_weight), momentum.begin(), thrust::multiplies<float>() );
-        thrust::copy(ys.begin(), ys.end(), yt_1.begin());
-
-        // Apply the forces
-        thrust::transform(ys.begin(), ys.end(), forces.begin(), ys.begin(), thrust::plus<float>());
-        thrust::transform(ys.begin(), ys.end(), momentum.begin(), ys.begin(), thrust::plus<float>());
-
-        //TODO: Add early termination for loss deltas
-        
-        if (i % 100 == 0)
-            std::cout << "Iteration: " << i << ", Loss: " << loss << ", ForceMag: " << tsnecuda::util::L2NormDeviceVector(forces) << std::endl;
-
-        #ifdef DEBUG
-            // Dump the points
-            thrust::copy(ys.begin(), ys.end(), host_ys);
-            for (int i = 0; i < N; i++) {
-                for (int j = 0; j < PROJDIM; j++) {
-                    dump_file << host_ys[i + j*N] << " ";
-                }
-                dump_file << std::endl;
-            }
-        #endif
-    }
-    #ifdef DEBUG
-        dump_file.close();
-    #endif
-    return ys;
-}
-
-thrust::device_vector<float> NaiveTSNE::tsne(cublasHandle_t &handle, 
-                                                thrust::device_vector<float> &d_points, 
-                                                unsigned int N_POINTS, 
-                                                unsigned int N_DIMS, 
-                                                unsigned int PROJDIM, 
-                                                float perplexity, 
-                                                float early_ex, 
-                                                float learning_rate, 
-                                                unsigned int n_iter, 
-                                                unsigned int n_iter_np, 
-                                                float min_g_norm){
-    
-    // Compute a max-norm on the points
-    tsnecuda::util::MaxNormalizeDeviceVector(d_points);
-    
-    // Setup a random generator
-    std::default_random_engine generator(time(NULL));
-
-    // Choose the right sigmas
-    std::cout << "Selecting sigmas to match perplexity..." << std::endl;
-    float eps = 1e-2;
-    thrust::device_vector<float> sigmas = NaiveTSNE::search_perplexity(handle, d_points, perplexity, eps, N_POINTS, N_DIMS);
-    thrust::device_vector<float> pij(N_POINTS*N_POINTS);
-    NaiveTSNE::compute_pij(handle, pij, d_points, sigmas, N_POINTS, N_DIMS);
-    NaiveTSNE::symmetrize_pij(handle, pij, N_POINTS);
-
-    // Allocate some memory for the foces and such
-    thrust::device_vector<float> forces(N_POINTS * PROJDIM);
-    thrust::device_vector<float> ys = tsnecuda::util::RandomDeviceUniformZeroOneVector(generator, N_POINTS * PROJDIM);
-    
-    // Momentum variables
-    thrust::device_vector<float> yt_1(N_POINTS * PROJDIM);
-    thrust::device_vector<float> momentum(N_POINTS * PROJDIM);
-    
-
-    // Qij and distance vector allocations
-    thrust::device_vector<float> qij(N_POINTS * N_POINTS);
-    thrust::device_vector<float> dist(N_POINTS * N_POINTS);
-
-    // Setup the learning rate
-    float eta = learning_rate*early_ex;
-    float momentum_weight = 0.5f;
-    float loss = 0.0f;
-    bool using_early = true;
-
-    float best_error = 0.0;
-    int best_iter = 0;
-
-    for (int i = 0; i < n_iter; i++) {
-
-        // Check for and turn off early exaggeration
-        if (using_early) { if (i > 250) {
-            using_early = false;
-            eta /= early_ex;
-            momentum_weight = 0.8;
-        }}
-        
-        // Compute the loss/gradients
-        loss = NaiveTSNE::compute_gradients(handle, forces, dist, ys, pij, qij, N_POINTS, PROJDIM, eta);
-        
-        // Compute the momentum
-        thrust::transform(ys.begin(), ys.end(), yt_1.begin(), momentum.begin(), thrust::minus<float>());
-        thrust::transform(momentum.begin(), momentum.end(), thrust::make_constant_iterator(momentum_weight), momentum.begin(), thrust::multiplies<float>() );
-        thrust::copy(ys.begin(), ys.end(), yt_1.begin());
-
-        // Apply the forces
-        thrust::transform(ys.begin(), ys.end(), forces.begin(), ys.begin(), thrust::plus<float>());
-        thrust::transform(ys.begin(), ys.end(), momentum.begin(), ys.begin(), thrust::plus<float>());
-
-        // Terminate if we're not changing loss much
-        if (loss < best_error || i == 0) {
-            best_error = loss;
-            best_iter = i;
-        } else {if (i - best_iter > n_iter_np) break;}
-
-        // Terminate if we're less than the minimum gradient norm
-        if (tsnecuda::util::L2NormDeviceVector(forces) < min_g_norm) break;
-    }
-    return ys;
-}
-
diff --git a/src/naive_tsne_cpu.cu b/src/naive_tsne_cpu.cu
deleted file mode 100644
index ba57ea0..0000000
--- a/src/naive_tsne_cpu.cu
+++ /dev/null
@@ -1,183 +0,0 @@
-#include "common.h"
-
-std::vector<float> squared_pairwise_dist(std::vector<float> &points, const unsigned int N, const unsigned int NDIMS) {
-	std::vector<float> squared_pairwise_dist(N * N, 0);
-	for (int i = 0; i < N; i++) {
-		for (int j = 0; j < N; j++) {
-			for(int k = 0; k < NDIMS; k++) {
-				squared_pairwise_dist[i * N + j] += (points[i * NDIMS + k] - points[j * NDIMS + k]) * (points[i * NDIMS + k] - points[j * NDIMS + k]); 
-			}
-		}
-    }
-	return squared_pairwise_dist;
-}
-
-
-
-bool perplexity_equal(const float delta, float perplexity, float target_perplexity) {
-	return (perplexity >= target_perplexity - delta) && (perplexity <= target_perplexity + delta);
-}
-
-
-float get_perplexity(std::vector<float> & pij, const unsigned int i, unsigned int N) {
-	float entropy = 0.0f;
-	for (int j = 0; j < N; j++) {
-		if (j != i) {
-			entropy += pij[i*N + j] * std::log2(pij[i*N + j]);
-		}
-	}
-	return std::pow(2, -entropy);
-}
-
-bool compare_perplexity(std::vector<float>& pij, 
-					   float& lo, 
-					   float& mid, 
-					   float& hi, 
-					   const unsigned int i, 
-					   const unsigned int N, 
-					   const float delta, 
-					   const float target_perplexity) {
-	float perplexity = get_perplexity(pij, i, N);
-	if (perplexity_equal(delta, perplexity, target_perplexity)) {
-		return true;
-	} else if (perplexity > target_perplexity) {
-		hi = mid - delta;
-	} else {
-		lo = mid + delta;
-	}
-
-	mid = (lo + hi)/2;
-	return false;
-}
-
-
-void recompute_pij_row_cpu(std::vector<float> &points, 
-	                           std::vector<float> &pij, 
-	                           float sigma,
-	                           float i, 
-	                           const unsigned int N, 
-	                           const unsigned int NDIMS) {
-	std::vector<float> dists = squared_pairwise_dist(points, N, NDIMS);
-	for (int j = 0; j < N; j++) {
-		float denom = 0;
-		for (int k = 0; k < N; k++) {
-			if (k != i) {
-				denom += std::exp(-(dists[i * N + k] / (2 * sigma * sigma)));
-			}
-		}
-        if (i != j) {
-		    pij[i * N + j] = std::exp(-dists[i * N + j] / (2 * sigma * sigma)) / denom;
-        }
-
-    }
-}
-
-std::vector<float> compute_pij_cpu(std::vector<float> &points, 
-	                           std::vector<float> &sigma, 
-	                           const unsigned int N, 
-	                           const unsigned int NDIMS) {
-	std::vector<float> pij_out(N * N, 0.0f);
-	std::vector<float> dists = squared_pairwise_dist(points, N, NDIMS);
-
-	for (int i = 0; i < N; i++) {
-		for (int j = 0; j < N; j++) {
-			float denom = 0;
-			for (int k = 0; k < N; k++) {
-				if (k != i) {
-					denom += std::exp(-(dists[i * N + k] / (2 * sigma[i] * sigma[i])));
-				}
-			}
-            if (i != j) {
-			    pij_out[i * N + j] = std::exp(-dists[i * N + j] / (2 * sigma[i] * sigma[i])) / denom;
-            }
-           
-        }
-	}
-	return pij_out;
-
-}
-
-std::vector<float> compute_qij_cpu(std::vector<float>& ys, const unsigned int N, const unsigned int PROJDIMS) {
-	
-	std::vector<float> qij_out(N * N);
-	std::vector<float> dists = squared_pairwise_dist(ys, N, PROJDIMS);
-
-	for (int i = 0; i < N; i++) {
-		for (int j = 0; j < N; j++) {
-			float denom = 0;
-			for (int k = 0; k < N; k++) {
-				if (k != i) {
-					denom += 1 / (1 + (dists[i * N + k]));
-				}
-			}
-			qij_out[i * N + j] = (1 / (1 + dists[i * N + j])) / denom;
-		}
-	}
-	return qij_out;
-}
-
-float kl_div(float pij, float qij) {
-	return pij * std::log(pij / qij);
-}
-
-float compute_gradients_cpu(std::vector<float> &forces,
-                    	std::vector<float> &dist, 
-                        std::vector<float> &ys, 
-                        std::vector<float> &pij, 
-                        std::vector<float> &qij, 
-                        const unsigned int N,
-                        float eta) {
-
-	float loss = 0.0f;
-	for (int i = 0; i < N; i++) {
-		for (int j = 0; j < N; j++) {
-			float pij_sym = (pij[i * N + j] + pij[j * N + i]) / (2 * N);
-			float qij_sym = (qij[i * N + j] + qij[j * N + i]) / (2 * N);
-			loss += kl_div(pij_sym, qij_sym);
-		}
-
-	}
-	return loss;
-
-}
-
-
-std::vector<float> sigmas_search_cpu(std::vector<float> &points,  
-	                           const unsigned int N, 
-	                           const unsigned int NDIMS,
-	                           float target_perplexity) {
-	const float max_sigma = 1000.0f;
-	const float delta = 0.1f;
-	std::vector<float> sigmas(N, max_sigma/2);
-	std::vector<float> pij = compute_pij_cpu(points, sigmas, N, NDIMS);
-	for (int i = 0; i < N; i++) {
-		bool found = false;
-		float lo = 0.0f;
-		float hi = max_sigma;
-		float mid = (lo + hi)/ 2;
-		while (!found) {
-			found = compare_perplexity(pij, lo, mid, hi, i, N, delta, target_perplexity);
-			recompute_pij_row_cpu(points, pij, mid, i, N, NDIMS);
-        }
-		sigmas[i] = mid;
-	}
-	return sigmas;
-
-}
-
-std::vector<float> naive_tsne_cpu(std::vector<float> &points, 
-                              const unsigned int N, 
-                              const unsigned int NDIMS) {
-	std::default_random_engine generator;
-  	std::uniform_real_distribution<double> distribution(-10.0f,10.0f);
- 	const unsigned int NPROJDIM = 2;
- 	std::vector<float> ys(N * NPROJDIM);
- 	for (int i = 0; i < N * NPROJDIM; i++) {
- 		ys[i] = distribution(generator);
- 	}
-	for (int i = 0; i < 1000; i++) {
-
-	}
-    return ys;
-
-}
diff --git a/src/nbodyfft.cu b/src/nbodyfft.cu
index 586e4a1..177116e 100644
--- a/src/nbodyfft.cu
+++ b/src/nbodyfft.cu
@@ -1,7 +1,4 @@
-#include "nbodyfft.h"
-#include <cufft.h>
-#include "include/util/cuda_utils.h"
-#include "include/util/matrix_broadcast_utils.h"
+#include "include/nbodyfft.h"
 
 #define cufftSafeCall(err)  __cufftSafeCall(err, __FILE__, __LINE__)
 
@@ -57,7 +54,7 @@ inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
     }
 }
 
-__global__ void copy_to_fft_input(volatile float * __restrict__ fft_input, 
+__global__ void copy_to_fft_input(volatile float * __restrict__ fft_input,
                                   const float * w_coefficients_device,
                                   const int n_fft_coeffs,
                                   const int n_fft_coeffs_half,
@@ -70,14 +67,14 @@ __global__ void copy_to_fft_input(volatile float * __restrict__ fft_input,
 
     register int current_term = TID / (n_fft_coeffs_half * n_fft_coeffs_half);
     register int current_loc = TID % (n_fft_coeffs_half * n_fft_coeffs_half);
-    
+
     i = current_loc / n_fft_coeffs_half;
     j = current_loc % n_fft_coeffs_half;
 
     fft_input[current_term * (n_fft_coeffs * n_fft_coeffs) + i * n_fft_coeffs + j] = w_coefficients_device[current_term + current_loc * n_terms];
 }
 
-__global__ void copy_from_fft_output(volatile float * __restrict__ y_tilde_values, 
+__global__ void copy_from_fft_output(volatile float * __restrict__ y_tilde_values,
     const float * fft_output,
     const int n_fft_coeffs,
     const int n_fft_coeffs_half,
@@ -107,12 +104,12 @@ __global__ void compute_point_box_idx(volatile int * __restrict__ point_box_idx,
                                       const float box_width,
                                       const int n_boxes,
                                       const int n_total_boxes,
-                                      const int N) 
+                                      const int N)
 {
     register int TID = threadIdx.x + blockIdx.x * blockDim.x;
     if (TID >= N)
         return;
-    
+
     register int x_idx = (int) ((xs[TID] - coord_min) / box_width);
     register int y_idx = (int) ((ys[TID] - coord_min) / box_width);
 
@@ -146,7 +143,7 @@ __global__ void interpolate_device(
 
     i = TID % N;
     j = TID / N;
-    
+
     value = 1;
     ybox_i = y_in_box[i];
 
@@ -174,7 +171,7 @@ __global__ void compute_interpolated_indices(
     TID = threadIdx.x + blockIdx.x * blockDim.x;
     if (TID >= n_terms * n_interpolation_points * n_interpolation_points * N)
         return;
-    
+
     current_term = TID % n_terms;
     i = (TID / n_terms) % N;
     interp_j = ((TID / n_terms) / N) % n_interpolation_points;
@@ -189,7 +186,7 @@ __global__ void compute_interpolated_indices(
                                 (box_j * n_interpolation_points) + interp_j;
     // interpolated_indices[TID] = idx * n_terms + current_term;
     atomicAdd(
-        w_coefficients_device + idx * n_terms + current_term, 
+        w_coefficients_device + idx * n_terms + current_term,
         x_interpolated_values[i + interp_i * N] * y_interpolated_values[i + interp_j * N] * chargesQij[i * n_terms + current_term]);
 }
 
@@ -208,7 +205,7 @@ __global__ void compute_potential_indices(
     TID = threadIdx.x + blockIdx.x * blockDim.x;
     if (TID >= n_terms * n_interpolation_points * n_interpolation_points * N)
         return;
-    
+
     current_term = TID % n_terms;
     i = (TID / n_terms) % N;
     interp_j = ((TID / n_terms) / N) % n_interpolation_points;
@@ -223,7 +220,7 @@ __global__ void compute_potential_indices(
     // interpolated_values[TID] = x_interpolated_values[i + interp_i * N] * y_interpolated_values[i + interp_j * N] * y_tilde_values[idx * n_terms + current_term];
     // interpolated_indices[TID] = i * n_terms + current_term;
     atomicAdd(
-        potentialsQij + i * n_terms + current_term, 
+        potentialsQij + i * n_terms + current_term,
         x_interpolated_values[i + interp_i * N] * y_interpolated_values[i + interp_j * N] * y_tilde_values[idx * n_terms + current_term]);
 }
 
@@ -244,7 +241,7 @@ __global__ void compute_kernel_tilde(
     TID = threadIdx.x + blockIdx.x * blockDim.x;
     if (TID >= n_interpolation_points_1d * n_interpolation_points_1d)
         return;
-    
+
     i = TID / n_interpolation_points_1d;
     j = TID % n_interpolation_points_1d;
 
@@ -253,8 +250,8 @@ __global__ void compute_kernel_tilde(
     kernel_tilde[(n_interpolation_points_1d - i) * n_fft_coeffs + (n_interpolation_points_1d + j)] = tmp;
     kernel_tilde[(n_interpolation_points_1d + i) * n_fft_coeffs + (n_interpolation_points_1d - j)] = tmp;
     kernel_tilde[(n_interpolation_points_1d - i) * n_fft_coeffs + (n_interpolation_points_1d - j)] = tmp;
-    
-} 
+
+}
 
 __global__ void compute_upper_and_lower_bounds(
     volatile float * __restrict__ box_upper_bounds,
@@ -293,9 +290,18 @@ __global__ void copy_to_w_coefficients(
     w_coefficients_device[output_indices[TID]] = output_values[TID];
 }
 
-void precompute_2d(cufftHandle &plan_kernel_tilde, float x_max, float x_min, float y_max, float y_min, int n_boxes, int n_interpolation_points,
-                   thrust::device_vector<float> &box_lower_bounds_device, thrust::device_vector<float> &box_upper_bounds_device, 
-                   thrust::device_vector<float> &kernel_tilde_device, thrust::device_vector<thrust::complex<float>> &fft_kernel_tilde_device) {
+void tsnecuda::PrecomputeFFT2D(
+        cufftHandle &plan_kernel_tilde,
+        float x_max,
+        float x_min,
+        float y_max,
+        float y_min,
+        int n_boxes,
+        int n_interpolation_points,
+        thrust::device_vector<float> &box_lower_bounds_device,
+        thrust::device_vector<float> &box_upper_bounds_device,
+        thrust::device_vector<float> &kernel_tilde_device,
+        thrust::device_vector<thrust::complex<float> > &fft_kernel_tilde_device) {
     const int num_threads = 32;
     int num_blocks = (n_boxes * n_boxes + num_threads - 1) / num_threads;
     /*
@@ -303,7 +309,7 @@ void precompute_2d(cufftHandle &plan_kernel_tilde, float x_max, float x_min, flo
      */
     int n_total_boxes = n_boxes * n_boxes;
     float box_width = (x_max - x_min) / (float) n_boxes;
-    
+
     // Left and right bounds of each box, first the lower bounds in the x direction, then in the y direction
     compute_upper_and_lower_bounds<<<num_blocks, num_threads>>>(
         thrust::raw_pointer_cast(box_upper_bounds_device.data()),
@@ -323,27 +329,27 @@ void precompute_2d(cufftHandle &plan_kernel_tilde, float x_max, float x_min, flo
     // thrust::device_vector<float> kernel_tilde_device(n_fft_coeffs * n_fft_coeffs);
     num_blocks = (n_interpolation_points_1d * n_interpolation_points_1d + num_threads - 1) / num_threads;
     compute_kernel_tilde<<<num_blocks, num_threads>>>(
-        thrust::raw_pointer_cast(kernel_tilde_device.data()), 
+        thrust::raw_pointer_cast(kernel_tilde_device.data()),
         x_min, y_min, h, n_interpolation_points_1d, n_fft_coeffs);
     GpuErrorCheck(cudaDeviceSynchronize());
-    
+
     // Precompute the FFT of the kernel generating matrix
-    
-    cufftExecR2C(plan_kernel_tilde, 
+
+    cufftExecR2C(plan_kernel_tilde,
         reinterpret_cast<cufftReal *>(thrust::raw_pointer_cast(kernel_tilde_device.data())),
         reinterpret_cast<cufftComplex *>(thrust::raw_pointer_cast(fft_kernel_tilde_device.data())));
-    
+
 }
 
 
 
-void n_body_fft_2d(
+void tsnecuda::NbodyFFT2D(
     cufftHandle &plan_dft,
     cufftHandle &plan_idft,
-    int N, 
-    int n_terms, 
+    int N,
+    int n_terms,
     int n_boxes,
-    int n_interpolation_points, 
+    int n_interpolation_points,
     thrust::device_vector<thrust::complex<float>> &fft_kernel_tilde_device,
     int n_total_boxes,
     int total_interpolation_points,
@@ -355,7 +361,7 @@ void n_body_fft_2d(
     thrust::device_vector<float> &fft_input,
     thrust::device_vector<thrust::complex<float>> &fft_w_coefficients,
     thrust::device_vector<float> &fft_output,
-    thrust::device_vector<int> &point_box_idx_device, 
+    thrust::device_vector<int> &point_box_idx_device,
     thrust::device_vector<float> &x_in_box_device,
     thrust::device_vector<float> &y_in_box_device,
     thrust::device_vector<float> &points_device,
@@ -375,13 +381,13 @@ void n_body_fft_2d(
     // std::cout << "start" << std::endl;
     const int num_threads = 128;
     int num_blocks = (N + num_threads - 1) / num_threads;
-    
+
      // Compute box indices and the relative position of each point in its box in the interval [0, 1]
     compute_point_box_idx<<<num_blocks, num_threads>>>(
         thrust::raw_pointer_cast(point_box_idx_device.data()),
         thrust::raw_pointer_cast(x_in_box_device.data()),
         thrust::raw_pointer_cast(y_in_box_device.data()),
-        thrust::raw_pointer_cast(points_device.data()), 
+        thrust::raw_pointer_cast(points_device.data()),
         thrust::raw_pointer_cast(points_device.data() + num_nodes + 1),
         thrust::raw_pointer_cast(box_lower_bounds_device.data()),
         coord_min,
@@ -392,11 +398,11 @@ void n_body_fft_2d(
     );
 
     GpuErrorCheck(cudaDeviceSynchronize());
+
     /*
      * Step 1: Interpolate kernel using Lagrange polynomials and compute the w coefficients
      */
     // Compute the interpolated values at each real point with each Lagrange polynomial in the `x` direction
-
     num_blocks = (N * n_interpolation_points + num_threads - 1) / num_threads;
     interpolate_device<<<num_blocks, num_threads>>>(
         thrust::raw_pointer_cast(x_interpolated_values_device.data()),
@@ -432,10 +438,11 @@ void n_body_fft_2d(
         n_terms
     );
     GpuErrorCheck(cudaDeviceSynchronize());
+
     /*
      * Step 2: Compute the values v_{m, n} at the equispaced nodes, multiply the kernel matrix with the coefficients w
      */
-    
+
     num_blocks = ((n_terms * n_fft_coeffs_half * n_fft_coeffs_half) + num_threads - 1) / num_threads;
     copy_to_fft_input<<<num_blocks, num_threads>>>(
         thrust::raw_pointer_cast(fft_input.data()),
@@ -446,21 +453,21 @@ void n_body_fft_2d(
     );
     GpuErrorCheck(cudaDeviceSynchronize());
     // Compute fft values at interpolated nodes
-    cufftExecR2C(plan_dft, 
-        reinterpret_cast<cufftReal *>(thrust::raw_pointer_cast(fft_input.data())), 
+    cufftExecR2C(plan_dft,
+        reinterpret_cast<cufftReal *>(thrust::raw_pointer_cast(fft_input.data())),
         reinterpret_cast<cufftComplex *>(thrust::raw_pointer_cast(fft_w_coefficients.data())));
     GpuErrorCheck(cudaDeviceSynchronize());
 
     // Take the broadcasted Hadamard product of a complex matrix and a complex vector
     tsnecuda::util::BroadcastMatrixVector(
-        fft_w_coefficients, fft_kernel_tilde_device, n_fft_coeffs * (n_fft_coeffs / 2 + 1), n_terms, 
+        fft_w_coefficients, fft_kernel_tilde_device, n_fft_coeffs * (n_fft_coeffs / 2 + 1), n_terms,
         thrust::multiplies<thrust::complex<float>>(), 0, thrust::complex<float>(1.0));
 
-    
+
 
     // Invert the computed values at the interpolated nodes
-    cufftExecC2R(plan_idft, 
-        reinterpret_cast<cufftComplex *>(thrust::raw_pointer_cast(fft_w_coefficients.data())), 
+    cufftExecC2R(plan_idft,
+        reinterpret_cast<cufftComplex *>(thrust::raw_pointer_cast(fft_w_coefficients.data())),
         reinterpret_cast<cufftReal *>(thrust::raw_pointer_cast(fft_output.data())));
     GpuErrorCheck(cudaDeviceSynchronize());
     copy_from_fft_output<<<num_blocks, num_threads>>>(
@@ -471,6 +478,7 @@ void n_body_fft_2d(
         n_terms
     );
     GpuErrorCheck(cudaDeviceSynchronize());
+
     /*
      * Step 3: Compute the potentials \tilde{\phi}
      */
@@ -488,37 +496,3 @@ void n_body_fft_2d(
     );
     GpuErrorCheck(cudaDeviceSynchronize());
 }
-
-float* get_ntime() {
-	return _nbody_times;
-}
-
-
-void interpolate(int n_interpolation_points, int N, const float *y_in_box, const float *y_tilde_spacings,
-                 float *interpolated_values, const float *denominator) {
-    // The denominators are the same across the interpolants, so we only need to compute them once
-    // auto *denominator = new float[n_interpolation_points];
-    // for (int i = 0; i < n_interpolation_points; i++) {
-    //     denominator[i] = 1;
-    //     for (int j = 0; j < n_interpolation_points; j++) {
-    //         if (i != j) {
-    //             denominator[i] *= y_tilde_spacings[i] - y_tilde_spacings[j];
-    //         }
-    //     }
-    // }
-
-    // Compute the numerators and the interpolant value
-    for (int i = 0; i < N; i++) {
-        for (int j = 0; j < n_interpolation_points; j++) {
-            interpolated_values[j * N + i] = 1;
-            for (int k = 0; k < n_interpolation_points; k++) {
-                if (j != k) {
-                    interpolated_values[j * N + i] *= y_in_box[i] - y_tilde_spacings[k];
-                }
-            }
-            interpolated_values[j * N + i] /= denominator[j];
-        }
-    }
-
-    // delete[] denominator;
-}

From 2ecccbad04fca3a52a6de862666230b4c8bc3f05 Mon Sep 17 00:00:00 2001
From: Roshan Rao <roshan_rao@berkeley.edu>
Date: Mon, 10 Jun 2019 17:06:37 -0700
Subject: [PATCH 05/14] move nbodyfft to kernels, fix CufftSafeCall

---
 CMakeLists.txt                       |  2 +-
 src/fit_tsne.cu                      | 71 +++------------------
 src/include/common.h                 |  5 +-
 src/include/fit_tsne.h               |  1 +
 src/include/{ => kernels}/nbodyfft.h |  6 +-
 src/include/macros.h                 | 14 -----
 src/include/transforms.h             |  0
 src/include/util/cuda_utils.h        | 13 ++--
 src/{ => kernels}/nbodyfft.cu        | 56 +----------------
 src/util/cuda_utils.cu               | 93 +++++++++++++++++++++++++---
 src/util/data_utils.cu               |  6 +-
 11 files changed, 112 insertions(+), 155 deletions(-)
 rename src/include/{ => kernels}/nbodyfft.h (95%)
 delete mode 100644 src/include/macros.h
 delete mode 100644 src/include/transforms.h
 rename src/{ => kernels}/nbodyfft.cu (92%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 62a94e2..c609763 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -152,11 +152,11 @@ set(SOURCES
     src/kernels/apply_forces.cu
     src/kernels/attr_forces.cu
     src/kernels/perplexity_search.cu
+    src/kernels/nbodyfft.cu
 
     # Method files
     src/ext/pymodule_ext.cu
     src/fit_tsne.cu
-    src/nbodyfft.cu
 )
 
 
diff --git a/src/fit_tsne.cu b/src/fit_tsne.cu
index 28cffc2..6bec506 100644
--- a/src/fit_tsne.cu
+++ b/src/fit_tsne.cu
@@ -3,61 +3,12 @@
 */
 
 #include "include/fit_tsne.h"
-#include "include/nbodyfft.h"
 #include <chrono>
 
-#define cufftSafeCall(err)  __cufftSafeCall(err, __FILE__, __LINE__)
-
 #define START_IL_TIMER() start = std::chrono::high_resolution_clock::now();
 #define END_IL_TIMER(x) stop = std::chrono::high_resolution_clock::now(); duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start); x += duration; total_time += duration;
 #define PRINT_IL_TIMER(x) std::cout << #x << ": " << ((float) x.count()) / 1000000.0 << "s" << std::endl
 
-static const char *_cudaGetErrorEnum(cufftResult error)
-{
-    switch (error)
-    {
-        case CUFFT_SUCCESS:
-            return "CUFFT_SUCCESS";
-
-        case CUFFT_INVALID_PLAN:
-            return "CUFFT_INVALID_PLAN";
-
-        case CUFFT_ALLOC_FAILED:
-            return "CUFFT_ALLOC_FAILED";
-
-        case CUFFT_INVALID_TYPE:
-            return "CUFFT_INVALID_TYPE";
-
-        case CUFFT_INVALID_VALUE:
-            return "CUFFT_INVALID_VALUE";
-
-        case CUFFT_INTERNAL_ERROR:
-            return "CUFFT_INTERNAL_ERROR";
-
-        case CUFFT_EXEC_FAILED:
-            return "CUFFT_EXEC_FAILED";
-
-        case CUFFT_SETUP_FAILED:
-            return "CUFFT_SETUP_FAILED";
-
-        case CUFFT_INVALID_SIZE:
-            return "CUFFT_INVALID_SIZE";
-
-        case CUFFT_UNALIGNED_DATA:
-            return "CUFFT_UNALIGNED_DATA";
-    }
-
-    return "<unknown>";
-}
-
-inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
-{
-    if( CUFFT_SUCCESS != err) {
-		fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
-									_cudaGetErrorEnum(err)); \
-		cudaDeviceReset(); assert(0); \
-    }
-}
 
 float squared_cauchy_2d(float x1, float x2, float y1, float y2) {
     return pow(1.0 + pow(x1 - y1, 2) + pow(x2 - y2, 2), -2);
@@ -245,11 +196,7 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt,
     cusparseSetMatIndexBase(sparse_matrix_descriptor,CUSPARSE_INDEX_BASE_ZERO);
 
     // Setup some return information if we're working on snapshots
-    int snap_interval;
     int snap_num = 0;
-    if (opt.return_style == tsnecuda::RETURN_STYLE::SNAPSHOT) {
-      snap_interval = opt.iterations / (opt.num_snapshots-1);
-    }
 
     // Get constants from options
     const int num_points = opt.num_points;
@@ -461,18 +408,18 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt,
 
     // Create the FFT Handles
     cufftHandle plan_kernel_tilde, plan_dft, plan_idft;;
-    cufftSafeCall(cufftCreate(&plan_kernel_tilde));
-    cufftSafeCall(cufftCreate(&plan_dft));
-    cufftSafeCall(cufftCreate(&plan_idft));
+    CufftSafeCall(cufftCreate(&plan_kernel_tilde));
+    CufftSafeCall(cufftCreate(&plan_dft));
+    CufftSafeCall(cufftCreate(&plan_idft));
 
     size_t work_size, work_size_dft, work_size_idft;
     int fft_dimensions[2] = {n_fft_coeffs, n_fft_coeffs};
-    cufftSafeCall(cufftMakePlan2d(plan_kernel_tilde, fft_dimensions[0], fft_dimensions[1], CUFFT_R2C, &work_size));
-    cufftSafeCall(cufftMakePlanMany(plan_dft, 2, fft_dimensions,
+    CufftSafeCall(cufftMakePlan2d(plan_kernel_tilde, fft_dimensions[0], fft_dimensions[1], CUFFT_R2C, &work_size));
+    CufftSafeCall(cufftMakePlanMany(plan_dft, 2, fft_dimensions,
                                     NULL, 1, n_fft_coeffs * n_fft_coeffs,
                                     NULL, 1, n_fft_coeffs * (n_fft_coeffs / 2 + 1),
                                     CUFFT_R2C, n_terms, &work_size_dft));
-    cufftSafeCall(cufftMakePlanMany(plan_idft, 2, fft_dimensions,
+    CufftSafeCall(cufftMakePlanMany(plan_idft, 2, fft_dimensions,
                                     NULL, 1, n_fft_coeffs * (n_fft_coeffs / 2 + 1),
                                     NULL, 1, n_fft_coeffs * n_fft_coeffs,
                                     CUFFT_C2R, n_terms, &work_size_idft));
@@ -690,9 +637,9 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt,
 
     }
 
-    cufftSafeCall(cufftDestroy(plan_kernel_tilde));
-    cufftSafeCall(cufftDestroy(plan_dft));
-    cufftSafeCall(cufftDestroy(plan_idft));
+    CufftSafeCall(cufftDestroy(plan_kernel_tilde));
+    CufftSafeCall(cufftDestroy(plan_dft));
+    CufftSafeCall(cufftDestroy(plan_idft));
 
     if (opt.verbosity > 0) {
         PRINT_IL_TIMER(_time_initialization);
diff --git a/src/include/common.h b/src/include/common.h
index e7bbfa1..885a12a 100644
--- a/src/include/common.h
+++ b/src/include/common.h
@@ -1,6 +1,6 @@
 /**
  * @brief Common includes for the Cuda-TSNE project
- * 
+ *
  * @file common.h
  * @author David Chan
  * @date 2018-04-04
@@ -57,7 +57,4 @@
 #include <iomanip>
 #include <cmath>
 
-// Additional Macros
-#include "macros.h"
-
 #endif
diff --git a/src/include/fit_tsne.h b/src/include/fit_tsne.h
index 87387d9..61fe542 100644
--- a/src/include/fit_tsne.h
+++ b/src/include/fit_tsne.h
@@ -23,6 +23,7 @@
 #include "include/kernels/apply_forces.h"
 #include "include/kernels/attr_forces.h"
 #include "include/kernels/perplexity_search.h"
+#include "include/kernels/nbodyfft.h"
 
 namespace tsnecuda {
 void RunTsne(tsnecuda::Options &opt, tsnecuda::GpuOptions &gpu_opt);
diff --git a/src/include/nbodyfft.h b/src/include/kernels/nbodyfft.h
similarity index 95%
rename from src/include/nbodyfft.h
rename to src/include/kernels/nbodyfft.h
index 01154f5..a5ab5cb 100644
--- a/src/include/nbodyfft.h
+++ b/src/include/kernels/nbodyfft.h
@@ -4,9 +4,9 @@
 #include <complex>
 #include <cufft.h>
 #include <thrust/complex.h>
-#include "common.h"
-#include "include/util/cuda_utils.h"
-#include "include/util/matrix_broadcast_utils.h"
+#include "../common.h"
+#include "../util/cuda_utils.h"
+#include "../util/matrix_broadcast_utils.h"
 
 namespace tsnecuda {
 
diff --git a/src/include/macros.h b/src/include/macros.h
deleted file mode 100644
index 32644d6..0000000
--- a/src/include/macros.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/**
- * @brief Definable macros for the program. Altering these may help with optimization
- * 
- * @file macros.h
- * @author David Chan
- * @date 2018-04-04
- */
-
-#ifndef MACROS_H
-#define MACROS_H
-
-// Nothin here yet!    
-
-#endif
diff --git a/src/include/transforms.h b/src/include/transforms.h
deleted file mode 100644
index e69de29..0000000
diff --git a/src/include/util/cuda_utils.h b/src/include/util/cuda_utils.h
index af55001..111a729 100644
--- a/src/include/util/cuda_utils.h
+++ b/src/include/util/cuda_utils.h
@@ -1,17 +1,17 @@
 /**
  * @brief Utility/Error Checking code.
- * 
+ *
  * @file cuda_utils.cu
  * @date 2018-04-04
- * Copyright (C) 2012-2017 Orange Owl Solutions.  
+ * Copyright (C) 2012-2017 Orange Owl Solutions.
  */
 
  /*
  CUDA Utilities - Utilities for high performance CPUs/C/C++ and GPUs/CUDA computing library.
-    
-    Copyright (C) 2012-2017 Orange Owl Solutions.  
 
-    
+    Copyright (C) 2012-2017 Orange Owl Solutions.
+
+
     CUDA Utilities is free software: you can redistribute it and/or modify
     it under the terms of the Lesser GNU General Public License as published by
     the Free Software Foundation, either version 3 of the License, or
@@ -34,11 +34,12 @@
 #ifndef CUDA_UTILITIES_CUH
 #define CUDA_UTILITIES_CUH
 
-#include "include/common.h"
+#include "../common.h"
 
 __host__ __device__ int iDivUp(int, int);
 extern "C" void CublasSafeCall(cublasStatus_t);
 extern "C" void CusparseSafeCall(cusparseStatus_t err);
+extern "C" void CufftSafeCall(cufftResult err);
 extern "C" void GpuErrorCheck(cudaError_t ans);
 
 #endif
diff --git a/src/nbodyfft.cu b/src/kernels/nbodyfft.cu
similarity index 92%
rename from src/nbodyfft.cu
rename to src/kernels/nbodyfft.cu
index 177116e..21adb08 100644
--- a/src/nbodyfft.cu
+++ b/src/kernels/nbodyfft.cu
@@ -1,58 +1,4 @@
-#include "include/nbodyfft.h"
-
-#define cufftSafeCall(err)  __cufftSafeCall(err, __FILE__, __LINE__)
-
-clock_t _nbody_fft_timer;
-float _nbody_times[10];
-
-#define _ntime(x) _nbody_times[x] += ( (float) clock() - _nbody_fft_timer ) / CLOCKS_PER_SEC; _nbody_fft_timer = clock();
-
-static const char *_cudaGetErrorEnum(cufftResult error)
-{
-    switch (error)
-    {
-        case CUFFT_SUCCESS:
-            return "CUFFT_SUCCESS";
-
-        case CUFFT_INVALID_PLAN:
-            return "CUFFT_INVALID_PLAN";
-
-        case CUFFT_ALLOC_FAILED:
-            return "CUFFT_ALLOC_FAILED";
-
-        case CUFFT_INVALID_TYPE:
-            return "CUFFT_INVALID_TYPE";
-
-        case CUFFT_INVALID_VALUE:
-            return "CUFFT_INVALID_VALUE";
-
-        case CUFFT_INTERNAL_ERROR:
-            return "CUFFT_INTERNAL_ERROR";
-
-        case CUFFT_EXEC_FAILED:
-            return "CUFFT_EXEC_FAILED";
-
-        case CUFFT_SETUP_FAILED:
-            return "CUFFT_SETUP_FAILED";
-
-        case CUFFT_INVALID_SIZE:
-            return "CUFFT_INVALID_SIZE";
-
-        case CUFFT_UNALIGNED_DATA:
-            return "CUFFT_UNALIGNED_DATA";
-    }
-
-    return "<unknown>";
-}
-
-inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
-{
-    if( CUFFT_SUCCESS != err) {
-		fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \
-									_cudaGetErrorEnum(err)); \
-		cudaDeviceReset(); assert(0); \
-    }
-}
+#include "../include/kernels/nbodyfft.h"
 
 __global__ void copy_to_fft_input(volatile float * __restrict__ fft_input,
                                   const float * w_coefficients_device,
diff --git a/src/util/cuda_utils.cu b/src/util/cuda_utils.cu
index 14d194a..78edaf4 100644
--- a/src/util/cuda_utils.cu
+++ b/src/util/cuda_utils.cu
@@ -1,18 +1,18 @@
 /**
  * @brief Utility/Error Checking code.
- * 
+ *
  * @file cuda_utils.cu
  * @date 2018-04-04
- * Copyright (C) 2012-2017 Orange Owl Solutions.  
+ * Copyright (C) 2012-2017 Orange Owl Solutions.
  */
 
 
  /*
  CUDA Utilities - Utilities for high performance CPUs/C/C++ and GPUs/CUDA computing library.
-    
-    Copyright (C) 2012-2017 Orange Owl Solutions.  
 
-    
+    Copyright (C) 2012-2017 Orange Owl Solutions.
+
+
     CUDA Utilities is free software: you can redistribute it and/or modify
     it under the terms of the Lesser GNU General Public License as published by
     the Free Software Foundation, either version 3 of the License, or
@@ -31,7 +31,7 @@
     or send an e-mail to: info@orangeowlsolutions.com
 */
 
-#include "include/util/cuda_utils.h"
+#include "../include/util/cuda_utils.h"
 
 /*******************/
 /* iDivUp FUNCTION */
@@ -159,5 +159,84 @@ inline void __CusparseSafeCall(cusparseStatus_t err, const char *file, const int
 
 extern "C" void CusparseSafeCall(cusparseStatus_t err) { __CusparseSafeCall(err, __FILE__, __LINE__); }
 
+/************************/
+/* CUFFT ERROR CHECKING */
+/************************/
+
+static const char *_cufftGetErrorEnum(cufftResult error)
+{
+    switch (error)
+    {
+        case CUFFT_SUCCESS:
+            return "CUFFT_SUCCESS";
+
+        case CUFFT_INVALID_PLAN:
+            return "CUFFT_INVALID_PLAN";
+
+        case CUFFT_ALLOC_FAILED:
+            return "CUFFT_ALLOC_FAILED";
+
+        case CUFFT_INVALID_TYPE:
+            return "CUFFT_INVALID_TYPE";
+
+        case CUFFT_INVALID_VALUE:
+            return "CUFFT_INVALID_VALUE";
+
+        case CUFFT_INTERNAL_ERROR:
+            return "CUFFT_INTERNAL_ERROR";
+
+        case CUFFT_EXEC_FAILED:
+            return "CUFFT_EXEC_FAILED";
+
+        case CUFFT_SETUP_FAILED:
+            return "CUFFT_SETUP_FAILED";
+
+        case CUFFT_INVALID_SIZE:
+            return "CUFFT_INVALID_SIZE";
+
+        case CUFFT_UNALIGNED_DATA:
+            return "CUFFT_UNALIGNED_DATA";
+
+        case CUFFT_INVALID_DEVICE:
+            return "CUFFT_INVALID_DEVICE";
+
+        case CUFFT_INCOMPLETE_PARAMETER_LIST:
+            return "CUFFT_INCOMPLETE_PARAMETER_LIST";
+
+        case CUFFT_PARSE_ERROR:
+            return "CUFFT_PARSE_ERROR";
+
+        case CUFFT_NO_WORKSPACE:
+            return "CUFFT_NO_WORKSPACE";
+
+        case CUFFT_NOT_SUPPORTED:
+            return "CUFFT_NOT_SUPPORTED";
+
+        case CUFFT_NOT_IMPLEMENTED:
+            return "CUFFT_NOT_IMPLEMENTED";
+
+        case CUFFT_LICENSE_ERROR:
+            return "CUFFT_LICENSE_ERROR";
+
+        default:
+            return "<unknown>";
+    }
+}
+
+inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
+{
+    if( CUFFT_SUCCESS != err) {
+		fprintf(stderr,
+                "CUFFT error in file '%s', line %d, error %s\nterminating!\n",
+                __FILE__,
+                __LINE__,
+				_cufftGetErrorEnum(err));
+		cudaDeviceReset();
+        assert(0);
+    }
+}
+
+extern "C" void CufftSafeCall(cufftResult err) { __cufftSafeCall(err, __FILE__, __LINE__); }
+
 
-/// END OF CUDA UTILITIES
\ No newline at end of file
+/// END OF CUDA UTILITIES
diff --git a/src/util/data_utils.cu b/src/util/data_utils.cu
index 19fb93a..16d2fc7 100644
--- a/src/util/data_utils.cu
+++ b/src/util/data_utils.cu
@@ -1,13 +1,13 @@
 /**
  * @brief Implementation file of the data_utils.h header
- * 
+ *
  * @file data_utils.cu
  * @author your name
  * @date 2018-05-05
  * Copyright (c) 2018, Regents of the University of Californias
  */
 
-#include "include/util/data_utils.h"
+#include "../include/util/data_utils.h"
 
 float* tsnecuda::util::LoadMnist(std::string file_name, int32_t& num_images,
         int32_t& num_rows, int32_t& num_columns) {
@@ -92,7 +92,7 @@ float* tsnecuda::util::LoadCifar10(std::string file_path) {
     for (size_t i = 0; i < 5; i++) {
         char binary_file_name[50];
         snprintf(binary_file_name, sizeof(binary_file_name),
-            "/data_batch_%d.bin", i + 1);
+            "/data_batch_%zu.bin", i + 1);
 
         std::string file_name = file_path;
         file_name.append(binary_file_name);

From 2a6402efac866d950d8fab146c6b32bbbc6e40d8 Mon Sep 17 00:00:00 2001
From: Roshan Rao <roshan_rao@berkeley.edu>
Date: Mon, 10 Jun 2019 17:24:59 -0700
Subject: [PATCH 06/14] create rep_forces.cu file

---
 CMakeLists.txt                    |   1 +
 src/fit_tsne.cu                   | 106 ++----------------------------
 src/include/fit_tsne.h            |   1 +
 src/include/kernels/attr_forces.h |   4 +-
 src/include/kernels/rep_forces.h  |  27 ++++++++
 src/kernels/rep_forces.cu         |  94 ++++++++++++++++++++++++++
 6 files changed, 129 insertions(+), 104 deletions(-)
 create mode 100644 src/include/kernels/rep_forces.h
 create mode 100644 src/kernels/rep_forces.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c609763..e91cf0e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -151,6 +151,7 @@ set(SOURCES
     # Kernels
     src/kernels/apply_forces.cu
     src/kernels/attr_forces.cu
+    src/kernels/rep_forces.cu
     src/kernels/perplexity_search.cu
     src/kernels/nbodyfft.cu
 
diff --git a/src/fit_tsne.cu b/src/fit_tsne.cu
index 6bec506..472905e 100644
--- a/src/fit_tsne.cu
+++ b/src/fit_tsne.cu
@@ -9,7 +9,6 @@
 #define END_IL_TIMER(x) stop = std::chrono::high_resolution_clock::now(); duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start); x += duration; total_time += duration;
 #define PRINT_IL_TIMER(x) std::cout << #x << ": " << ((float) x.count()) / 1000000.0 << "s" << std::endl
 
-
 float squared_cauchy_2d(float x1, float x2, float y1, float y2) {
     return pow(1.0 + pow(x1 - y1, 2) + pow(x2 - y2, 2), -2);
 }
@@ -59,98 +58,6 @@ struct minmax_binary_op
   }
 };
 
-__global__ void compute_repulsive_forces_kernel(
-    volatile float * __restrict__ repulsive_forces_device,
-    volatile float * __restrict__ normalization_vec_device,
-    const float * const xs,
-    const float * const ys,
-    const float * const potentialsQij,
-    const int num_points,
-    const int num_nodes,
-    const int n_terms)
-{
-    register int TID = threadIdx.x + blockIdx.x * blockDim.x;
-    if (TID >= num_points)
-        return;
-
-    register float phi1, phi2, phi3, phi4, x_pt, y_pt;
-
-    phi1 = potentialsQij[TID * n_terms + 0];
-    phi2 = potentialsQij[TID * n_terms + 1];
-    phi3 = potentialsQij[TID * n_terms + 2];
-    phi4 = potentialsQij[TID * n_terms + 3];
-
-    x_pt = xs[TID];
-    y_pt = ys[TID];
-
-    normalization_vec_device[TID] =
-        (1 + x_pt * x_pt + y_pt * y_pt) * phi1 - 2 * (x_pt * phi2 + y_pt * phi3) + phi4;
-
-    repulsive_forces_device[TID] = x_pt * phi1 - phi2;
-    repulsive_forces_device[TID + num_nodes + 1] = y_pt * phi1 - phi3;
-}
-
-float compute_repulsive_forces(
-    thrust::device_vector<float> &repulsive_forces_device,
-    thrust::device_vector<float> &normalization_vec_device,
-    thrust::device_vector<float> &points_device,
-    thrust::device_vector<float> &potentialsQij,
-    const int num_points,
-    const int num_nodes,
-    const int n_terms)
-{
-    const int num_threads = 1024;
-    const int num_blocks = (num_points + num_threads - 1) / num_threads;
-    compute_repulsive_forces_kernel<<<num_blocks, num_threads>>>(
-        thrust::raw_pointer_cast(repulsive_forces_device.data()),
-        thrust::raw_pointer_cast(normalization_vec_device.data()),
-        thrust::raw_pointer_cast(points_device.data()),
-        thrust::raw_pointer_cast(points_device.data() + num_nodes + 1),
-        thrust::raw_pointer_cast(potentialsQij.data()),
-        num_points, num_nodes, n_terms);
-    float sumQ = thrust::reduce(
-        normalization_vec_device.begin(), normalization_vec_device.end(), 0,
-        thrust::plus<float>());
-    return sumQ - num_points;
-}
-
-__global__ void compute_chargesQij_kernel(
-    volatile float * __restrict__ chargesQij,
-    const float * const xs,
-    const float * const ys,
-    const int num_points,
-    const int n_terms)
-{
-    register int TID = threadIdx.x + blockIdx.x * blockDim.x;
-    if (TID >= num_points)
-        return;
-
-    register float x_pt, y_pt;
-    x_pt = xs[TID];
-    y_pt = ys[TID];
-
-    chargesQij[TID * n_terms + 0] = 1;
-    chargesQij[TID * n_terms + 1] = x_pt;
-    chargesQij[TID * n_terms + 2] = y_pt;
-    chargesQij[TID * n_terms + 3] = x_pt * x_pt + y_pt * y_pt;
-}
-
-void compute_chargesQij(
-    thrust::device_vector<float> &chargesQij,
-    thrust::device_vector<float> &points_device,
-    const int num_points,
-    const int num_nodes,
-    const int n_terms)
-{
-    const int num_threads = 1024;
-    const int num_blocks = (num_points + num_threads - 1) / num_threads;
-    compute_chargesQij_kernel<<<num_blocks, num_threads>>>(
-        thrust::raw_pointer_cast(chargesQij.data()),
-        thrust::raw_pointer_cast(points_device.data()),
-        thrust::raw_pointer_cast(points_device.data() + num_nodes + 1),
-        num_points, n_terms);
-}
-
 void tsnecuda::RunTsne(tsnecuda::Options &opt,
                        tsnecuda::GpuOptions &gpu_opt)
 {
@@ -495,7 +402,7 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt,
 
         // Prepare the terms that we'll use to compute the sum i.e. the repulsive forces
         START_IL_TIMER();
-        compute_chargesQij(chargesQij_device, points_device, num_points, num_points - 1, n_terms);
+        tsnecuda::ComputeChargesQij(chargesQij_device, points_device, num_points, num_points - 1, n_terms);
         END_IL_TIMER(_time_compute_charges);
 
         // Compute Minimax elements
@@ -538,7 +445,7 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt,
         // different ones. We subtract N at the end because the following sums over all i, j, whereas Z contains i \neq j
 
         // Make the negative term, or F_rep in the equation 3 of the paper
-        normalization = compute_repulsive_forces(
+        normalization = tsnecuda::ComputeRepulsiveForces(
             repulsive_forces_device, normalization_vec_device, points_device,
             potentialsQij_device, num_points, num_points - 1, n_terms);
 
@@ -578,15 +485,12 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt,
                                   num_points - 1,
                                   num_points,
                                   num_blocks);
-        // Add a bit of random motion to prevent points from being on top of each other
-        // thrust::transform(points_device.begin(), points_device.end(), random_vector_device.begin(),
-        //                     points_device.begin(), thrust::plus<float>());
 
         // // Compute the gradient norm
         // tsnecuda::util::SquareDeviceVector(attractive_forces_device, old_forces_device);
         // thrust::transform(attractive_forces_device.begin(), attractive_forces_device.begin()+num_points,
-        //                   attractive_forces_device.begin()+num_points, attractive_forces_device.begin(),
-        //                   thrust::plus<float>());
+                          // attractive_forces_device.begin()+num_points, attractive_forces_device.begin(),
+                          // thrust::plus<float>());
         // tsnecuda::util::SqrtDeviceVector(attractive_forces_device, attractive_forces_device);
         // float grad_norm = thrust::reduce(
         //     attractive_forces_device.begin(), attractive_forces_device.begin() + num_points,
@@ -655,8 +559,6 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt,
         PRINT_IL_TIMER(_time_apply_forces);
         PRINT_IL_TIMER(_time_other);
         PRINT_IL_TIMER(total_time);
-
-        // std::cout << "Total Time: " << ((float) total_time.count()) / 1000000.0  << std::endl;
     }
 
 
diff --git a/src/include/fit_tsne.h b/src/include/fit_tsne.h
index 61fe542..5eeea42 100644
--- a/src/include/fit_tsne.h
+++ b/src/include/fit_tsne.h
@@ -24,6 +24,7 @@
 #include "include/kernels/attr_forces.h"
 #include "include/kernels/perplexity_search.h"
 #include "include/kernels/nbodyfft.h"
+#include "include/kernels/rep_forces.h"
 
 namespace tsnecuda {
 void RunTsne(tsnecuda::Options &opt, tsnecuda::GpuOptions &gpu_opt);
diff --git a/src/include/kernels/attr_forces.h b/src/include/kernels/attr_forces.h
index 59e3404..94f8c29 100644
--- a/src/include/kernels/attr_forces.h
+++ b/src/include/kernels/attr_forces.h
@@ -6,8 +6,8 @@
  * @date 2018-05-08
  * Copyright (c) 2018, Regents of the University of California
  */
-#ifndef SRC_INCLUDE_KERNELS_BH_ATTR_FORCES_H_
-#define SRC_INCLUDE_KERNELS_BH_ATTR_FORCES_H_
+#ifndef SRC_INCLUDE_KERNELS_ATTR_FORCES_H_
+#define SRC_INCLUDE_KERNELS_ATTR_FORCES_H_
 
 #include "../common.h"
 #include "../options.h"
diff --git a/src/include/kernels/rep_forces.h b/src/include/kernels/rep_forces.h
new file mode 100644
index 0000000..5c0e61e
--- /dev/null
+++ b/src/include/kernels/rep_forces.h
@@ -0,0 +1,27 @@
+#ifndef SRC_INCLUDE_KERNELS_REP_FORCES_H_
+#define SRC_INCLUDE_KERNELS_REP_FORCES_H_
+
+#include "../common.h"
+#include "../options.h"
+#include "../util/cuda_utils.h"
+
+namespace tsnecuda {
+
+float ComputeRepulsiveForces(
+    thrust::device_vector<float> &repulsive_forces_device,
+    thrust::device_vector<float> &normalization_vec_device,
+    thrust::device_vector<float> &points_device,
+    thrust::device_vector<float> &potentialsQij,
+    const int num_points,
+    const int num_nodes,
+    const int n_terms);
+
+void ComputeChargesQij(
+    thrust::device_vector<float> &chargesQij,
+    thrust::device_vector<float> &points_device,
+    const int num_points,
+    const int num_nodes,
+    const int n_terms);
+}
+
+#endif
diff --git a/src/kernels/rep_forces.cu b/src/kernels/rep_forces.cu
new file mode 100644
index 0000000..6accb35
--- /dev/null
+++ b/src/kernels/rep_forces.cu
@@ -0,0 +1,94 @@
+#include "../include/kernels/rep_forces.h"
+
+
+__global__ void compute_repulsive_forces_kernel(
+    volatile float * __restrict__ repulsive_forces_device,
+    volatile float * __restrict__ normalization_vec_device,
+    const float * const xs,
+    const float * const ys,
+    const float * const potentialsQij,
+    const int num_points,
+    const int num_nodes,
+    const int n_terms)
+{
+    register int TID = threadIdx.x + blockIdx.x * blockDim.x;
+    if (TID >= num_points)
+        return;
+
+    register float phi1, phi2, phi3, phi4, x_pt, y_pt;
+
+    phi1 = potentialsQij[TID * n_terms + 0];
+    phi2 = potentialsQij[TID * n_terms + 1];
+    phi3 = potentialsQij[TID * n_terms + 2];
+    phi4 = potentialsQij[TID * n_terms + 3];
+
+    x_pt = xs[TID];
+    y_pt = ys[TID];
+
+    normalization_vec_device[TID] =
+        (1 + x_pt * x_pt + y_pt * y_pt) * phi1 - 2 * (x_pt * phi2 + y_pt * phi3) + phi4;
+
+    repulsive_forces_device[TID] = x_pt * phi1 - phi2;
+    repulsive_forces_device[TID + num_nodes + 1] = y_pt * phi1 - phi3;
+}
+
+float tsnecuda::ComputeRepulsiveForces(
+    thrust::device_vector<float> &repulsive_forces_device,
+    thrust::device_vector<float> &normalization_vec_device,
+    thrust::device_vector<float> &points_device,
+    thrust::device_vector<float> &potentialsQij,
+    const int num_points,
+    const int num_nodes,
+    const int n_terms)
+{
+    const int num_threads = 1024;
+    const int num_blocks = (num_points + num_threads - 1) / num_threads;
+    compute_repulsive_forces_kernel<<<num_blocks, num_threads>>>(
+        thrust::raw_pointer_cast(repulsive_forces_device.data()),
+        thrust::raw_pointer_cast(normalization_vec_device.data()),
+        thrust::raw_pointer_cast(points_device.data()),
+        thrust::raw_pointer_cast(points_device.data() + num_nodes + 1),
+        thrust::raw_pointer_cast(potentialsQij.data()),
+        num_points, num_nodes, n_terms);
+    float sumQ = thrust::reduce(
+        normalization_vec_device.begin(), normalization_vec_device.end(), 0,
+        thrust::plus<float>());
+    return sumQ - num_points;
+}
+
+__global__ void compute_chargesQij_kernel(
+    volatile float * __restrict__ chargesQij,
+    const float * const xs,
+    const float * const ys,
+    const int num_points,
+    const int n_terms)
+{
+    register int TID = threadIdx.x + blockIdx.x * blockDim.x;
+    if (TID >= num_points)
+        return;
+
+    register float x_pt, y_pt;
+    x_pt = xs[TID];
+    y_pt = ys[TID];
+
+    chargesQij[TID * n_terms + 0] = 1;
+    chargesQij[TID * n_terms + 1] = x_pt;
+    chargesQij[TID * n_terms + 2] = y_pt;
+    chargesQij[TID * n_terms + 3] = x_pt * x_pt + y_pt * y_pt;
+}
+
+void tsnecuda::ComputeChargesQij(
+    thrust::device_vector<float> &chargesQij,
+    thrust::device_vector<float> &points_device,
+    const int num_points,
+    const int num_nodes,
+    const int n_terms)
+{
+    const int num_threads = 1024;
+    const int num_blocks = (num_points + num_threads - 1) / num_threads;
+    compute_chargesQij_kernel<<<num_blocks, num_threads>>>(
+        thrust::raw_pointer_cast(chargesQij.data()),
+        thrust::raw_pointer_cast(points_device.data()),
+        thrust::raw_pointer_cast(points_device.data() + num_nodes + 1),
+        num_points, n_terms);
+}

From 1c0d8fbf2e17b207880a8a2963e0c2efb98ec5b3 Mon Sep 17 00:00:00 2001
From: Roshan Rao <roshan_rao@berkeley.edu>
Date: Mon, 10 Jun 2019 17:27:04 -0700
Subject: [PATCH 07/14] delete unused minmax

---
 src/fit_tsne.cu | 66 ++++++++++++++++++++++++-------------------------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/src/fit_tsne.cu b/src/fit_tsne.cu
index 472905e..921edcb 100644
--- a/src/fit_tsne.cu
+++ b/src/fit_tsne.cu
@@ -16,48 +16,48 @@ float squared_cauchy_2d(float x1, float x2, float y1, float y2) {
 
 // minmax_pair stores the minimum and maximum
 // values that have been encountered so far
-template <typename T>
-struct minmax_pair
-{
-  T min_val;
-  T max_val;
-};
+// template <typename T>
+// struct minmax_pair
+// {
+  // T min_val;
+  // T max_val;
+// };
 
 // minmax_unary_op is a functor that takes in a value x and
 // returns a minmax_pair whose minimum and maximum values
 // are initialized to x.
-template <typename T>
-struct minmax_unary_op
-  : public thrust::unary_function< T, minmax_pair<T> >
-{
-  __host__ __device__
-  minmax_pair<T> operator()(const T& x) const
-  {
-    minmax_pair<T> result;
-    result.min_val = x;
-    result.max_val = x;
-    return result;
-  }
-};
+// template <typename T>
+// struct minmax_unary_op
+  // : public thrust::unary_function< T, minmax_pair<T> >
+// {
+  // __host__ __device__
+  // minmax_pair<T> operator()(const T& x) const
+  // {
+    // minmax_pair<T> result;
+    // result.min_val = x;
+    // result.max_val = x;
+    // return result;
+  // }
+// };
 
 // minmax_binary_op is a functor that accepts two minmax_pair
 // structs and returns a new minmax_pair whose minimum and
 // maximum values are the min() and max() respectively of
 // the minimums and maximums of the input pairs
-template <typename T>
-struct minmax_binary_op
-  : public thrust::binary_function< minmax_pair<T>, minmax_pair<T>, minmax_pair<T> >
-{
-  __host__ __device__
-  minmax_pair<T> operator()(const minmax_pair<T>& x, const minmax_pair<T>& y) const
-  {
-    minmax_pair<T> result;
-    result.min_val = thrust::min(x.min_val, y.min_val);
-    result.max_val = thrust::max(x.max_val, y.max_val);
-    return result;
-  }
-};
-
+// template <typename T>
+// struct minmax_binary_op
+  // : public thrust::binary_function< minmax_pair<T>, minmax_pair<T>, minmax_pair<T> >
+// {
+  // __host__ __device__
+  // minmax_pair<T> operator()(const minmax_pair<T>& x, const minmax_pair<T>& y) const
+  // {
+    // minmax_pair<T> result;
+    // result.min_val = thrust::min(x.min_val, y.min_val);
+    // result.max_val = thrust::max(x.max_val, y.max_val);
+    // return result;
+  // }
+// };
+//
 void tsnecuda::RunTsne(tsnecuda::Options &opt,
                        tsnecuda::GpuOptions &gpu_opt)
 {

From 86b2695ada0d1202734dd682d07422e9fb266193 Mon Sep 17 00:00:00 2001
From: Roshan Rao <roshan_rao@berkeley.edu>
Date: Mon, 10 Jun 2019 17:27:41 -0700
Subject: [PATCH 08/14] delete squared_cauchy_2d - this is on GPU  now

---
 src/fit_tsne.cu | 49 -------------------------------------------------
 1 file changed, 49 deletions(-)

diff --git a/src/fit_tsne.cu b/src/fit_tsne.cu
index 921edcb..f89845f 100644
--- a/src/fit_tsne.cu
+++ b/src/fit_tsne.cu
@@ -9,55 +9,6 @@
 #define END_IL_TIMER(x) stop = std::chrono::high_resolution_clock::now(); duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start); x += duration; total_time += duration;
 #define PRINT_IL_TIMER(x) std::cout << #x << ": " << ((float) x.count()) / 1000000.0 << "s" << std::endl
 
-float squared_cauchy_2d(float x1, float x2, float y1, float y2) {
-    return pow(1.0 + pow(x1 - y1, 2) + pow(x2 - y2, 2), -2);
-}
-// compute minimum and maximum values in a single reduction
-
-// minmax_pair stores the minimum and maximum
-// values that have been encountered so far
-// template <typename T>
-// struct minmax_pair
-// {
-  // T min_val;
-  // T max_val;
-// };
-
-// minmax_unary_op is a functor that takes in a value x and
-// returns a minmax_pair whose minimum and maximum values
-// are initialized to x.
-// template <typename T>
-// struct minmax_unary_op
-  // : public thrust::unary_function< T, minmax_pair<T> >
-// {
-  // __host__ __device__
-  // minmax_pair<T> operator()(const T& x) const
-  // {
-    // minmax_pair<T> result;
-    // result.min_val = x;
-    // result.max_val = x;
-    // return result;
-  // }
-// };
-
-// minmax_binary_op is a functor that accepts two minmax_pair
-// structs and returns a new minmax_pair whose minimum and
-// maximum values are the min() and max() respectively of
-// the minimums and maximums of the input pairs
-// template <typename T>
-// struct minmax_binary_op
-  // : public thrust::binary_function< minmax_pair<T>, minmax_pair<T>, minmax_pair<T> >
-// {
-  // __host__ __device__
-  // minmax_pair<T> operator()(const minmax_pair<T>& x, const minmax_pair<T>& y) const
-  // {
-    // minmax_pair<T> result;
-    // result.min_val = thrust::min(x.min_val, y.min_val);
-    // result.max_val = thrust::max(x.max_val, y.max_val);
-    // return result;
-  // }
-// };
-//
 void tsnecuda::RunTsne(tsnecuda::Options &opt,
                        tsnecuda::GpuOptions &gpu_opt)
 {

From 092fe6b771c45ce90aab740ee7e6065374755678 Mon Sep 17 00:00:00 2001
From: Roshan Rao <roshan_rao@berkeley.edu>
Date: Mon, 10 Jun 2019 17:36:00 -0700
Subject: [PATCH 09/14] re-add support for snapshotting

---
 src/fit_tsne.cu | 37 +++++++++++++++----------------------
 1 file changed, 15 insertions(+), 22 deletions(-)

diff --git a/src/fit_tsne.cu b/src/fit_tsne.cu
index f89845f..52f8284 100644
--- a/src/fit_tsne.cu
+++ b/src/fit_tsne.cu
@@ -55,6 +55,10 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt,
 
     // Setup some return information if we're working on snapshots
     int snap_num = 0;
+    int snap_interval = 1;
+    if (opt.return_style == tsnecuda::RETURN_STYLE::SNAPSHOT) {
+        snap_interval = opt.iterations / (opt.num_snapshots - 1);
+    }
 
     // Get constants from options
     const int num_points = opt.num_points;
@@ -68,23 +72,16 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt,
     float attr_exaggeration = opt.early_exaggeration;
     float normalization;
 
-    // Figure out number of nodes needed for BH tree
-    int nnodes = num_points * 2;
-    if (nnodes < 1024 * num_blocks) nnodes = 1024 * num_blocks;
-    while ((nnodes & (gpu_opt.warp_size - 1)) != 0)
-        nnodes++;
-    nnodes--;
-    const int num_nodes = nnodes;
-    opt.num_nodes = num_nodes;
-
     // Allocate host memory
     float *knn_squared_distances = new float[num_points * num_neighbors];
     memset(knn_squared_distances, 0, num_points * num_neighbors * sizeof(float));
     long *knn_indices = new long[num_points * num_neighbors];
 
-    // Initialize global variables
-    thrust::device_vector<int> err_device(1);
-    // tsnecuda::Initialize(gpu_opt, err_device);
+    // Set cache configs
+    cudaFuncSetCacheConfig(tsnecuda::IntegrationKernel, cudaFuncCachePreferL1);
+    cudaFuncSetCacheConfig(tsnecuda::ComputePijxQijKernel, cudaFuncCachePreferShared);
+    GpuErrorCheck(cudaDeviceSynchronize());
+
 
     END_IL_TIMER(_time_initialization);
     START_IL_TIMER();
@@ -330,11 +327,7 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt,
             std::cout << "This version is not built with ZMQ for interative viz. Rebuild with WITH_ZMQ=TRUE for viz." << std::endl;
     #endif
 
-
-
     END_IL_TIMER(_time_init_fft);
-
-
     // Support for infinite iteration
     for (size_t step = 0; step != opt.iterations; step++) {
 
@@ -483,12 +476,12 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt,
         }
 
         // // Handle snapshoting
-        // if (opt.return_style == tsnecuda::RETURN_STYLE::SNAPSHOT && step % snap_interval == 0 && opt.return_data != nullptr) {
-        //   thrust::copy(points_device.begin(),
-        //                points_device.end(),
-        //                snap_num*opt.num_points*2 + opt.return_data);
-        //   snap_num += 1;
-        // }
+        if (opt.return_style == tsnecuda::RETURN_STYLE::SNAPSHOT && step % snap_interval == 0 && opt.return_data != nullptr) {
+          thrust::copy(points_device.begin(),
+                       points_device.end(),
+                       snap_num*opt.num_points*2 + opt.return_data);
+          snap_num += 1;
+        }
 
     }
 

From f1d7ee766032b71e9d58a4de650a39e3d9e9f657 Mon Sep 17 00:00:00 2001
From: Roshan Rao <roshan_rao@berkeley.edu>
Date: Mon, 10 Jun 2019 17:40:48 -0700
Subject: [PATCH 10/14] re-add support for gradient norms

---
 src/fit_tsne.cu | 40 ++++++++++++++++++----------------------
 1 file changed, 18 insertions(+), 22 deletions(-)

diff --git a/src/fit_tsne.cu b/src/fit_tsne.cu
index 52f8284..1af3ef0 100644
--- a/src/fit_tsne.cu
+++ b/src/fit_tsne.cu
@@ -384,10 +384,6 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt,
         END_IL_TIMER(_time_nbodyfft);
         START_IL_TIMER();
 
-        // Compute the normalization constant Z or sum of q_{ij}. This expression is different from the one in the original
-        // paper, but equivalent. This is done so we need only use a single kernel (K_2 in the paper) instead of two
-        // different ones. We subtract N at the end because the following sums over all i, j, whereas Z contains i \neq j
-
         // Make the negative term, or F_rep in the equation 3 of the paper
         normalization = tsnecuda::ComputeRepulsiveForces(
             repulsive_forces_device, normalization_vec_device, points_device,
@@ -431,24 +427,24 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt,
                                   num_blocks);
 
         // // Compute the gradient norm
-        // tsnecuda::util::SquareDeviceVector(attractive_forces_device, old_forces_device);
-        // thrust::transform(attractive_forces_device.begin(), attractive_forces_device.begin()+num_points,
-                          // attractive_forces_device.begin()+num_points, attractive_forces_device.begin(),
-                          // thrust::plus<float>());
-        // tsnecuda::util::SqrtDeviceVector(attractive_forces_device, attractive_forces_device);
-        // float grad_norm = thrust::reduce(
-        //     attractive_forces_device.begin(), attractive_forces_device.begin() + num_points,
-        //     0.0f, thrust::plus<float>()) / num_points;
-        // thrust::fill(attractive_forces_device.begin(), attractive_forces_device.end(), 0.0f);
-
-        // if (grad_norm < opt.min_gradient_norm) {
-        //     if (opt.verbosity >= 1) std::cout << "Reached minimum gradient norm: " << grad_norm << std::endl;
-        //     break;
-        // }
-
-        // if (opt.verbosity >= 1 && step % opt.print_interval == 0) {
-        //     std::cout << "[Step " << step << "] Avg. Gradient Norm: " << grad_norm << std::endl;
-        // }
+        tsnecuda::util::SquareDeviceVector(attractive_forces_device, old_forces_device);
+        thrust::transform(attractive_forces_device.begin(), attractive_forces_device.begin()+num_points,
+                          attractive_forces_device.begin()+num_points, attractive_forces_device.begin(),
+                          thrust::plus<float>());
+        tsnecuda::util::SqrtDeviceVector(attractive_forces_device, attractive_forces_device);
+        float grad_norm = thrust::reduce(
+            attractive_forces_device.begin(), attractive_forces_device.begin() + num_points,
+            0.0f, thrust::plus<float>()) / num_points;
+        thrust::fill(attractive_forces_device.begin(), attractive_forces_device.end(), 0.0f);
+
+        if (grad_norm < opt.min_gradient_norm) {
+            if (opt.verbosity >= 1) std::cout << "Reached minimum gradient norm: " << grad_norm << std::endl;
+            break;
+        }
+
+        if (opt.verbosity >= 1 && step % opt.print_interval == 0) {
+            std::cout << "[Step " << step << "] Avg. Gradient Norm: " << grad_norm << std::endl;
+        }
 
         END_IL_TIMER(_time_apply_forces);
 

From c98a9f7d1f19b2a6dbaebf5e025788d92e929589 Mon Sep 17 00:00:00 2001
From: Roshan Rao <roshan_rao@berkeley.edu>
Date: Mon, 10 Jun 2019 17:45:38 -0700
Subject: [PATCH 11/14] re-add compute architectures

---
 CMakeLists.txt | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e91cf0e..16bd30e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,13 +58,13 @@ SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
 set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};
                     -O3
                     -Xptxas -dlcm=cg
-                    # -gencode=arch=compute_30,code=sm_30
-                    # -gencode=arch=compute_35,code=sm_35
-                    # -gencode=arch=compute_50,code=sm_50
-                    # -gencode=arch=compute_52,code=sm_52
-                    # -gencode=arch=compute_60,code=sm_60
-		             -gencode=arch=compute_61,code=sm_61
-                    # -gencode=arch=compute_70,code=sm_70
+                    -gencode=arch=compute_30,code=sm_30
+                    -gencode=arch=compute_35,code=sm_35
+                    -gencode=arch=compute_50,code=sm_50
+                    -gencode=arch=compute_52,code=sm_52
+                    -gencode=arch=compute_60,code=sm_60
+		            -gencode=arch=compute_61,code=sm_61
+                    -gencode=arch=compute_70,code=sm_70
                     -std=c++11
                     -Xcompiler '-O3'
                     -Xcompiler '-fPIC'

From 9da214f28f038c2ca696f402764dcfddefdfebd5 Mon Sep 17 00:00:00 2001
From: Roshan Rao <roshan_rao@berkeley.edu>
Date: Mon, 10 Jun 2019 18:16:09 -0700
Subject: [PATCH 12/14] fix relative imports

---
 CMakeLists.txt                          | 12 ++++++------
 build.sh                                |  7 +++----
 meta.yaml                               |  4 ++--
 src/include/kernels/apply_forces.h      |  6 +++---
 src/include/kernels/attr_forces.h       |  6 +++---
 src/include/kernels/nbodyfft.h          |  6 +++---
 src/include/kernels/perplexity_search.h | 12 ++++++------
 src/include/kernels/rep_forces.h        |  6 +++---
 src/include/util/cuda_utils.h           |  2 +-
 src/kernels/apply_forces.cu             |  2 +-
 src/kernels/initialization.cu           | 13 -------------
 src/kernels/nbodyfft.cu                 |  2 +-
 src/kernels/rep_forces.cu               |  2 +-
 src/util/data_utils.cu                  |  2 +-
 14 files changed, 34 insertions(+), 48 deletions(-)
 delete mode 100644 src/kernels/initialization.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 16bd30e..20e04d7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,13 +58,13 @@ SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
 set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};
                     -O3
                     -Xptxas -dlcm=cg
-                    -gencode=arch=compute_30,code=sm_30
-                    -gencode=arch=compute_35,code=sm_35
-                    -gencode=arch=compute_50,code=sm_50
-                    -gencode=arch=compute_52,code=sm_52
-                    -gencode=arch=compute_60,code=sm_60
+                    # -gencode=arch=compute_30,code=sm_30
+                    # -gencode=arch=compute_35,code=sm_35
+                    # -gencode=arch=compute_50,code=sm_50
+                    # -gencode=arch=compute_52,code=sm_52
+                    # -gencode=arch=compute_60,code=sm_60
 		            -gencode=arch=compute_61,code=sm_61
-                    -gencode=arch=compute_70,code=sm_70
+                    # -gencode=arch=compute_70,code=sm_70
                     -std=c++11
                     -Xcompiler '-O3'
                     -Xcompiler '-fPIC'
diff --git a/build.sh b/build.sh
index b4e85cd..b930e39 100755
--- a/build.sh
+++ b/build.sh
@@ -1,11 +1,10 @@
-export PATH=/usr/local/cuda-9.0/bin:$PATH
-export LD_LIBRARY_PATH=/usr/local/cuda-9.0/lib64:$LD_LIBRARY_PATH
+# export PATH=/usr/local/cuda-9.0/bin:$PATH
+# export LD_LIBRARY_PATH=/usr/local/cuda-9.0/lib64:$LD_LIBRARY_PATH
 git submodule init
-git submodule update 
+git submodule update
 cd ./build
 cmake  .. -DBUILD_PYTHON=TRUE -DWITH_MKL=FALSE -DCMAKE_C_COMPILER=gcc-4.9 -DCMAKE_CXX_COMPILER=g++-4.9
 pwd
-make -j10
 make
 cd python/
 pwd
diff --git a/meta.yaml b/meta.yaml
index 3b9437c..3180603 100644
--- a/meta.yaml
+++ b/meta.yaml
@@ -1,5 +1,5 @@
 {% set name = "tsnecuda" %}
-{% set version = "0.1.1" %}
+{% set version = "0.2.1" %}
 
 package:
   name: '{{ name|lower }}'
@@ -8,7 +8,7 @@ package:
 source:
   git_url: https://github.com/CannyLab/tsne-cuda.git
   git_rev: HEAD
-  
+
 requirements:
   build:
     - python
diff --git a/src/include/kernels/apply_forces.h b/src/include/kernels/apply_forces.h
index 9aeb815..f755663 100644
--- a/src/include/kernels/apply_forces.h
+++ b/src/include/kernels/apply_forces.h
@@ -10,9 +10,9 @@
 #ifndef SRC_INCLUDE_KERNELS_APPLY_FORCES_H_
 #define SRC_INCLUDE_KERNELS_APPLY_FORCES_H_
 
-#include "../common.h"
-#include "../options.h"
-#include "../util/cuda_utils.h"
+#include "common.h"
+#include "options.h"
+#include "util/cuda_utils.h"
 
 namespace tsnecuda {
 __global__
diff --git a/src/include/kernels/attr_forces.h b/src/include/kernels/attr_forces.h
index 94f8c29..82e6116 100644
--- a/src/include/kernels/attr_forces.h
+++ b/src/include/kernels/attr_forces.h
@@ -9,9 +9,9 @@
 #ifndef SRC_INCLUDE_KERNELS_ATTR_FORCES_H_
 #define SRC_INCLUDE_KERNELS_ATTR_FORCES_H_
 
-#include "../common.h"
-#include "../options.h"
-#include "../util/cuda_utils.h"
+#include "common.h"
+#include "options.h"
+#include "util/cuda_utils.h"
 
 namespace tsnecuda {
 
diff --git a/src/include/kernels/nbodyfft.h b/src/include/kernels/nbodyfft.h
index a5ab5cb..73bcafb 100644
--- a/src/include/kernels/nbodyfft.h
+++ b/src/include/kernels/nbodyfft.h
@@ -4,9 +4,9 @@
 #include <complex>
 #include <cufft.h>
 #include <thrust/complex.h>
-#include "../common.h"
-#include "../util/cuda_utils.h"
-#include "../util/matrix_broadcast_utils.h"
+#include "common.h"
+#include "util/cuda_utils.h"
+#include "util/matrix_broadcast_utils.h"
 
 namespace tsnecuda {
 
diff --git a/src/include/kernels/perplexity_search.h b/src/include/kernels/perplexity_search.h
index 94d76e0..2ea7f1a 100644
--- a/src/include/kernels/perplexity_search.h
+++ b/src/include/kernels/perplexity_search.h
@@ -9,12 +9,12 @@
 #ifndef SRC_INCLUDE_KERNELS_PERPLEXITY_SEARCH_H_
 #define SRC_INCLUDE_KERNELS_PERPLEXITY_SEARCH_H_
 
-#include "../common.h"
-#include "../options.h"
-#include "../util/cuda_utils.h"
-#include "../util/reduce_utils.h"
-#include "../util/matrix_broadcast_utils.h"
-#include "../util/thrust_transform_functions.h"
+#include "common.h"
+#include "options.h"
+#include "util/cuda_utils.h"
+#include "util/reduce_utils.h"
+#include "util/matrix_broadcast_utils.h"
+#include "util/thrust_transform_functions.h"
 
 namespace tsnecuda {
 __global__
diff --git a/src/include/kernels/rep_forces.h b/src/include/kernels/rep_forces.h
index 5c0e61e..8c10cc6 100644
--- a/src/include/kernels/rep_forces.h
+++ b/src/include/kernels/rep_forces.h
@@ -1,9 +1,9 @@
 #ifndef SRC_INCLUDE_KERNELS_REP_FORCES_H_
 #define SRC_INCLUDE_KERNELS_REP_FORCES_H_
 
-#include "../common.h"
-#include "../options.h"
-#include "../util/cuda_utils.h"
+#include "common.h"
+#include "options.h"
+#include "util/cuda_utils.h"
 
 namespace tsnecuda {
 
diff --git a/src/include/util/cuda_utils.h b/src/include/util/cuda_utils.h
index 111a729..f535969 100644
--- a/src/include/util/cuda_utils.h
+++ b/src/include/util/cuda_utils.h
@@ -34,7 +34,7 @@
 #ifndef CUDA_UTILITIES_CUH
 #define CUDA_UTILITIES_CUH
 
-#include "../common.h"
+#include "common.h"
 
 __host__ __device__ int iDivUp(int, int);
 extern "C" void CublasSafeCall(cublasStatus_t);
diff --git a/src/kernels/apply_forces.cu b/src/kernels/apply_forces.cu
index 0160993..cbd295a 100644
--- a/src/kernels/apply_forces.cu
+++ b/src/kernels/apply_forces.cu
@@ -2,7 +2,7 @@
     Apply forces to the points with momentum, exaggeration, etc.
 */
 
-#include "../include/kernels/apply_forces.h"
+#include "include/kernels/apply_forces.h"
 
 
 /******************************************************************************/
diff --git a/src/kernels/initialization.cu b/src/kernels/initialization.cu
deleted file mode 100644
index b7d4259..0000000
--- a/src/kernels/initialization.cu
+++ /dev/null
@@ -1,13 +0,0 @@
-#include "../include/kernels/initialization.h"
-
-/******************************************************************************/
-/*** initialize memory ********************************************************/
-/******************************************************************************/
-
-
-void tsnecuda::Initialize(tsnecuda::GpuOptions &gpu_opt, thrust::device_vector<int> &errd)
-{
-    cudaFuncSetCacheConfig(tsnecuda::bh::IntegrationKernel, cudaFuncCachePreferL1);
-    cudaFuncSetCacheConfig(tsnecuda::bh::ComputePijxQijKernel, cudaFuncCachePreferShared);
-    GpuErrorCheck(cudaDeviceSynchronize());
-}
diff --git a/src/kernels/nbodyfft.cu b/src/kernels/nbodyfft.cu
index 21adb08..5474517 100644
--- a/src/kernels/nbodyfft.cu
+++ b/src/kernels/nbodyfft.cu
@@ -1,4 +1,4 @@
-#include "../include/kernels/nbodyfft.h"
+#include "include/kernels/nbodyfft.h"
 
 __global__ void copy_to_fft_input(volatile float * __restrict__ fft_input,
                                   const float * w_coefficients_device,
diff --git a/src/kernels/rep_forces.cu b/src/kernels/rep_forces.cu
index 6accb35..7878db7 100644
--- a/src/kernels/rep_forces.cu
+++ b/src/kernels/rep_forces.cu
@@ -1,4 +1,4 @@
-#include "../include/kernels/rep_forces.h"
+#include "include/kernels/rep_forces.h"
 
 
 __global__ void compute_repulsive_forces_kernel(
diff --git a/src/util/data_utils.cu b/src/util/data_utils.cu
index 16d2fc7..3511ec0 100644
--- a/src/util/data_utils.cu
+++ b/src/util/data_utils.cu
@@ -7,7 +7,7 @@
  * Copyright (c) 2018, Regents of the University of Californias
  */
 
-#include "../include/util/data_utils.h"
+#include "include/util/data_utils.h"
 
 float* tsnecuda::util::LoadMnist(std::string file_name, int32_t& num_images,
         int32_t& num_rows, int32_t& num_columns) {

From 89518c23861cf4782e8a1d4bdeff0af128c43f9d Mon Sep 17 00:00:00 2001
From: Roshan Rao <roshan_rao@berkeley.edu>
Date: Mon, 10 Jun 2019 18:20:27 -0700
Subject: [PATCH 13/14] remove relative imports

---
 src/exe/main.cu                  | 8 ++------
 src/ext/pymodule_ext.cu          | 2 +-
 src/include/ext/pymodule_ext.h   | 8 ++++----
 src/kernels/attr_forces.cu       | 2 +-
 src/kernels/perplexity_search.cu | 2 +-
 src/util/cuda_utils.cu           | 2 +-
 6 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/src/exe/main.cu b/src/exe/main.cu
index a3432f1..39a5cdf 100644
--- a/src/exe/main.cu
+++ b/src/exe/main.cu
@@ -2,12 +2,8 @@
 // args, so we don't have to re-build to change options.
 
 // Detailed includes
-// #include "../include/common.h"
-#include "../include/util/data_utils.h"
-// #include "../include/util/cuda_utils.h"
-// #include "../include/util/random_utils.h"
-// #include "../include/util/distance_utils.h"
-#include "../include/fit_tsne.h"
+#include "include/util/data_utils.h"
+#include "include/fit_tsne.h"
 #include <time.h>
 #include <string>
 
diff --git a/src/ext/pymodule_ext.cu b/src/ext/pymodule_ext.cu
index 7c2dff3..9a768d7 100644
--- a/src/ext/pymodule_ext.cu
+++ b/src/ext/pymodule_ext.cu
@@ -1,7 +1,7 @@
 
 // Implementation file for the python extensions
 
-#include "../include/ext/pymodule_ext.h"
+#include "ext/pymodule_ext.h"
 
 void pymodule_bh_tsne(float *result,
                       float* points,
diff --git a/src/include/ext/pymodule_ext.h b/src/include/ext/pymodule_ext.h
index ef9f6e9..56bd7f6 100644
--- a/src/include/ext/pymodule_ext.h
+++ b/src/include/ext/pymodule_ext.h
@@ -2,10 +2,10 @@
 #define PYMODULE_EXT_H
 
     #include <sys/types.h>
-    #include "../common.h"
-    #include "../options.h"
-    #include "../util/distance_utils.h"
-    #include "../fit_tsne.h"
+    #include "common.h"
+    #include "options.h"
+    #include "util/distance_utils.h"
+    #include "fit_tsne.h"
 
     extern "C" {
         /**
diff --git a/src/kernels/attr_forces.cu b/src/kernels/attr_forces.cu
index 1d83778..498c197 100644
--- a/src/kernels/attr_forces.cu
+++ b/src/kernels/attr_forces.cu
@@ -6,7 +6,7 @@
     Attractive force is given by pij*qij.
 */
 
-#include "../include/kernels/attr_forces.h"
+#include "kernels/attr_forces.h"
 
 __global__
 void tsnecuda::ComputePijxQijKernel(
diff --git a/src/kernels/perplexity_search.cu b/src/kernels/perplexity_search.cu
index 884979f..ed16529 100644
--- a/src/kernels/perplexity_search.cu
+++ b/src/kernels/perplexity_search.cu
@@ -5,7 +5,7 @@
     Note that FAISS returns the first row as the same point, with distance = 0. pii is defined as zero.
 */
 
-#include "../include/kernels/perplexity_search.h"
+#include "kernels/perplexity_search.h"
 
 __global__
 void tsnecuda::ComputePijKernel(
diff --git a/src/util/cuda_utils.cu b/src/util/cuda_utils.cu
index 78edaf4..579a5fb 100644
--- a/src/util/cuda_utils.cu
+++ b/src/util/cuda_utils.cu
@@ -31,7 +31,7 @@
     or send an e-mail to: info@orangeowlsolutions.com
 */
 
-#include "../include/util/cuda_utils.h"
+#include "util/cuda_utils.h"
 
 /*******************/
 /* iDivUp FUNCTION */

From 81b925fe64aedac5953b8450762e682ee2c60527 Mon Sep 17 00:00:00 2001
From: Roshan Rao <roshan_rao@berkeley.edu>
Date: Mon, 10 Jun 2019 18:25:05 -0700
Subject: [PATCH 14/14] re-add the different architectures again

---
 CMakeLists.txt | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 20e04d7..16bd30e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,13 +58,13 @@ SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
 set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};
                     -O3
                     -Xptxas -dlcm=cg
-                    # -gencode=arch=compute_30,code=sm_30
-                    # -gencode=arch=compute_35,code=sm_35
-                    # -gencode=arch=compute_50,code=sm_50
-                    # -gencode=arch=compute_52,code=sm_52
-                    # -gencode=arch=compute_60,code=sm_60
+                    -gencode=arch=compute_30,code=sm_30
+                    -gencode=arch=compute_35,code=sm_35
+                    -gencode=arch=compute_50,code=sm_50
+                    -gencode=arch=compute_52,code=sm_52
+                    -gencode=arch=compute_60,code=sm_60
 		            -gencode=arch=compute_61,code=sm_61
-                    # -gencode=arch=compute_70,code=sm_70
+                    -gencode=arch=compute_70,code=sm_70
                     -std=c++11
                     -Xcompiler '-O3'
                     -Xcompiler '-fPIC'