From 9a8340f15ab86be3119f25e224892f8839bac23d Mon Sep 17 00:00:00 2001 From: Roshan Rao Date: Mon, 10 Jun 2019 15:35:28 -0700 Subject: [PATCH 01/14] export compile commands --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b3e903..2b69752 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,7 @@ set (CMAKE_PROJECT_VERSION_MAJOR 1) set (CMAKE_PROJECT_VERSION_MINOR 0) set (CMAKE_PROJECT_VERSION_PATH 0) set (CMAKE_SKIP_RULE_DEPENDENCY TRUE) +set (CMAKE_EXPORT_COMPILE_COMMANDS 1) list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) # Options From d11dca491d5e2341f0e61686416471e5fef1ffd1 Mon Sep 17 00:00:00 2001 From: Roshan Rao Date: Mon, 10 Jun 2019 15:46:17 -0700 Subject: [PATCH 02/14] update gitignore --- .gitignore | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 0fc4f9c..5b8c3ab 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ tsne_utils +compile_commands.json *.o *.so @@ -24,7 +25,7 @@ Multicore-TSNE/dist/* build/* # exception to the rule -!build/.gitkeep +!build/.gitkeep # Ignore animation visualization/animation.gif @@ -32,3 +33,7 @@ visualization/animation.gif .DS_Store .mypy_cache/ .vscode/ + +# vim +*.swp +*.swo From 6fe82dac7bdd29b661e70fff314a36771565302f Mon Sep 17 00:00:00 2001 From: Roshan Rao Date: Mon, 10 Jun 2019 15:47:54 -0700 Subject: [PATCH 03/14] remove compute architectures for easier compilation when debugging --- CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2b69752..a9e0855 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,13 +58,13 @@ SET(CUDA_PROPAGATE_HOST_FLAGS OFF) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -O3 -Xptxas -dlcm=cg - -gencode=arch=compute_30,code=sm_30 - -gencode=arch=compute_35,code=sm_35 - -gencode=arch=compute_50,code=sm_50 - -gencode=arch=compute_52,code=sm_52 - -gencode=arch=compute_60,code=sm_60 + # -gencode=arch=compute_30,code=sm_30 + # -gencode=arch=compute_35,code=sm_35 + # -gencode=arch=compute_50,code=sm_50 + # -gencode=arch=compute_52,code=sm_52 + # -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 - -gencode=arch=compute_70,code=sm_70 + # -gencode=arch=compute_70,code=sm_70 -std=c++11 -Xcompiler '-O3' -Xcompiler '-fPIC' From da8ee2c91b123c5c307bba6bb9b0a2accb58e060 Mon Sep 17 00:00:00 2001 From: Roshan Rao Date: Mon, 10 Jun 2019 16:40:57 -0700 Subject: [PATCH 04/14] delete unused files, naive tsne files, bh tsne namespace --- CMakeLists.txt | 6 +- src/Makefile | 15 - src/exe/main.cu | 33 +- src/ext/pymodule_ext.cu | 4 +- src/{bh_tsne.cu => fit_tsne.cu} | 20 +- src/include/ext/pymodule_ext.h | 9 +- src/include/{bh_tsne.h => fit_tsne.h} | 5 +- src/include/kernels/apply_forces.h | 37 +- .../{bh_attr_forces.h => attr_forces.h} | 9 +- src/include/kernels/bh_rep_forces.h | 62 --- src/include/kernels/bounding_box.h | 53 --- src/include/kernels/initialization.h | 30 -- src/include/kernels/perplexity_search.h | 30 +- src/include/kernels/tree_builder.h | 49 --- src/include/kernels/tree_sort.h | 42 -- src/include/kernels/tree_summary.h | 45 -- src/include/naive_tsne.h | 149 ------- src/include/naive_tsne_cpu.h | 87 ---- src/include/nbodyfft.h | 41 +- src/include/tsne_vars.h | 20 - src/kernels/apply_forces.cu | 72 +-- .../{bh_attr_forces.cu => attr_forces.cu} | 8 +- src/kernels/bh_rep_forces.cu | 171 ------- src/kernels/bounding_box.cu | 133 ------ src/kernels/initialization.cu | 9 +- src/kernels/perplexity_search.cu | 96 +--- src/kernels/tree_builder.cu | 189 -------- src/kernels/tree_sort.cu | 72 --- src/kernels/tree_summary.cu | 166 ------- src/naive_tsne.cu | 416 ------------------ src/naive_tsne_cpu.cu | 183 -------- src/nbodyfft.cu | 124 +++--- 32 files changed, 139 insertions(+), 2246 deletions(-) delete mode 100644 src/Makefile rename src/{bh_tsne.cu => fit_tsne.cu} (98%) rename src/include/{bh_tsne.h => fit_tsne.h} (87%) rename src/include/kernels/{bh_attr_forces.h => attr_forces.h} (92%) delete mode 100644 src/include/kernels/bh_rep_forces.h delete mode 100644 src/include/kernels/bounding_box.h delete mode 100644 src/include/kernels/initialization.h delete mode 100644 src/include/kernels/tree_builder.h delete mode 100644 src/include/kernels/tree_sort.h delete mode 100644 src/include/kernels/tree_summary.h delete mode 100644 src/include/naive_tsne.h delete mode 100644 src/include/naive_tsne_cpu.h delete mode 100644 src/include/tsne_vars.h rename src/kernels/{bh_attr_forces.cu => attr_forces.cu} (92%) delete mode 100644 src/kernels/bh_rep_forces.cu delete mode 100644 src/kernels/bounding_box.cu delete mode 100644 src/kernels/tree_builder.cu delete mode 100644 src/kernels/tree_sort.cu delete mode 100644 src/kernels/tree_summary.cu delete mode 100644 src/naive_tsne.cu delete mode 100644 src/naive_tsne_cpu.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index a9e0855..62a94e2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -150,15 +150,13 @@ set(SOURCES # Kernels src/kernels/apply_forces.cu - src/kernels/bh_attr_forces.cu + src/kernels/attr_forces.cu src/kernels/perplexity_search.cu - src/kernels/initialization.cu # Method files src/ext/pymodule_ext.cu - src/bh_tsne.cu + src/fit_tsne.cu src/nbodyfft.cu - # src/naive_tsne.cu ) diff --git a/src/Makefile b/src/Makefile deleted file mode 100644 index a3c5243..0000000 --- a/src/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -APPS := bh -FLAGS := -O3 -arch=sm_61 -g -Xptxas -v -NVCC := nvcc -GCC := g++ -CC := $(GCC) - -.PHONY: all - -all: bh sparse - -bh : bh.cu - $(NVCC) $(FLAGS) -DVARIANT=0 -o $@ $< - -sparse : sparse.cu - $(NVCC) $(FLAGS) -lcusparse -o $@ $< diff --git a/src/exe/main.cu b/src/exe/main.cu index ce4c95d..a3432f1 100644 --- a/src/exe/main.cu +++ b/src/exe/main.cu @@ -1,16 +1,13 @@ - -// This file exposes a main file which does most of the testing with command line +// This file exposes a main file which does most of the testing with command line // args, so we don't have to re-build to change options. // Detailed includes -#include "common.h" -#include "util/data_utils.h" -#include "util/cuda_utils.h" -#include "util/random_utils.h" -#include "util/distance_utils.h" -#include "naive_tsne.h" -#include "naive_tsne_cpu.h" -#include "bh_tsne.h" +// #include "../include/common.h" +#include "../include/util/data_utils.h" +// #include "../include/util/cuda_utils.h" +// #include "../include/util/random_utils.h" +// #include "../include/util/distance_utils.h" +#include "../include/fit_tsne.h" #include #include @@ -45,7 +42,7 @@ int main(int argc, char** argv) { ("q,dim", "Point Dimensions", cxxopts::value()->default_value("50")) ("j,device", "Device to run on", cxxopts::value()->default_value("0")) ("h,help", "Print help"); - + // Parse command line options auto result = options.parse(argc, argv); @@ -98,7 +95,7 @@ int main(int argc, char** argv) { } // Do the t-SNE - tsnecuda::bh::RunTsne(opt, gpu_opt); + tsnecuda::RunTsne(opt, gpu_opt); // Clean up the data delete[] data; @@ -134,7 +131,7 @@ int main(int argc, char** argv) { } // Do the t-SNE - tsnecuda::bh::RunTsne(opt, gpu_opt); + tsnecuda::RunTsne(opt, gpu_opt); // Clean up the data delete[] data; @@ -151,7 +148,7 @@ int main(int argc, char** argv) { // DO the T-SNE printf("Starting TSNE calculation with %u points.\n", num_images); - + // Construct the options tsnecuda::Options opt(nullptr, data, num_images, num_columns*num_rows*num_channels); opt.perplexity = FOPT(perplexity); @@ -171,8 +168,8 @@ int main(int argc, char** argv) { } // Do the t-SNE - tsnecuda::bh::RunTsne(opt, gpu_opt); - + tsnecuda::RunTsne(opt, gpu_opt); + // Clean up the data delete[] data; @@ -194,7 +191,7 @@ int main(int argc, char** argv) { // Do the T-SNE printf("Starting TSNE calculation with %u points.\n", IOPT(num-points)); - + // Construct the options tsnecuda::Options opt(nullptr, thrust::raw_pointer_cast(h_X.data()), IOPT(num-points), IOPT(dim)); opt.perplexity = FOPT(perplexity); @@ -214,7 +211,7 @@ int main(int argc, char** argv) { } // Do the t-SNE - tsnecuda::bh::RunTsne(opt, gpu_opt); + tsnecuda::RunTsne(opt, gpu_opt); } else { std::cout << "Dataset not recognized..." << std::endl; diff --git a/src/ext/pymodule_ext.cu b/src/ext/pymodule_ext.cu index 567b13b..7c2dff3 100644 --- a/src/ext/pymodule_ext.cu +++ b/src/ext/pymodule_ext.cu @@ -1,7 +1,7 @@ // Implementation file for the python extensions -#include "include/ext/pymodule_ext.h" +#include "../include/ext/pymodule_ext.h" void pymodule_bh_tsne(float *result, float* points, @@ -108,7 +108,7 @@ void pymodule_bh_tsne(float *result, } // Do the t-SNE - tsnecuda::bh::RunTsne(opt, gpu_opt); + tsnecuda::RunTsne(opt, gpu_opt); // Copy the data back from the GPU cudaDeviceSynchronize(); diff --git a/src/bh_tsne.cu b/src/fit_tsne.cu similarity index 98% rename from src/bh_tsne.cu rename to src/fit_tsne.cu index 7183474..28cffc2 100644 --- a/src/bh_tsne.cu +++ b/src/fit_tsne.cu @@ -2,8 +2,8 @@ Compute t-SNE via Barnes-Hut for NlogN time. */ -#include "bh_tsne.h" -#include "nbodyfft.h" +#include "include/fit_tsne.h" +#include "include/nbodyfft.h" #include #define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__) @@ -200,8 +200,8 @@ void compute_chargesQij( num_points, n_terms); } -void tsnecuda::bh::RunTsne(tsnecuda::Options &opt, - tsnecuda::GpuOptions &gpu_opt) +void tsnecuda::RunTsne(tsnecuda::Options &opt, + tsnecuda::GpuOptions &gpu_opt) { auto start = std::chrono::high_resolution_clock::now(); auto stop = std::chrono::high_resolution_clock::now(); @@ -279,7 +279,7 @@ void tsnecuda::bh::RunTsne(tsnecuda::Options &opt, // Initialize global variables thrust::device_vector err_device(1); - tsnecuda::bh::Initialize(gpu_opt, err_device); + // tsnecuda::Initialize(gpu_opt, err_device); END_IL_TIMER(_time_initialization); START_IL_TIMER(); @@ -301,7 +301,7 @@ void tsnecuda::bh::RunTsne(tsnecuda::Options &opt, // Search Perplexity thrust::device_vector pij_non_symmetric_device(num_points * num_neighbors); - tsnecuda::bh::SearchPerplexity(gpu_opt, dense_handle, pij_non_symmetric_device, knn_squared_distances_device, + tsnecuda::SearchPerplexity(gpu_opt, dense_handle, pij_non_symmetric_device, knn_squared_distances_device, perplexity, perplexity_search_epsilon, num_points, num_neighbors); // Clean up memory @@ -560,7 +560,7 @@ void tsnecuda::bh::RunTsne(tsnecuda::Options &opt, // Compute the number of boxes in a single dimension and the total number of boxes in 2d // auto n_boxes_per_dim = static_cast(fmax(min_num_intervals, (max_coord - min_coord) / intervals_per_integer)); - precompute_2d( + tsnecuda::PrecomputeFFT2D( plan_kernel_tilde, max_coord, min_coord, max_coord, min_coord, n_boxes_per_dim, n_interpolation_points, box_lower_bounds_device, box_upper_bounds_device, kernel_tilde_device, fft_kernel_tilde_device); @@ -571,7 +571,7 @@ void tsnecuda::bh::RunTsne(tsnecuda::Options &opt, START_IL_TIMER(); - n_body_fft_2d( + tsnecuda::NbodyFFT2D( plan_dft, plan_idft, N, n_terms, n_boxes_per_dim, n_interpolation_points, fft_kernel_tilde_device, n_total_boxes, @@ -600,7 +600,7 @@ void tsnecuda::bh::RunTsne(tsnecuda::Options &opt, // Calculate Attractive Forces - tsnecuda::bh::ComputeAttractiveForces(gpu_opt, + tsnecuda::ComputeAttractiveForces(gpu_opt, sparse_handle, sparse_matrix_descriptor, attractive_forces_device, @@ -618,7 +618,7 @@ void tsnecuda::bh::RunTsne(tsnecuda::Options &opt, START_IL_TIMER(); // Apply Forces - tsnecuda::bh::ApplyForces(gpu_opt, + tsnecuda::ApplyForces(gpu_opt, points_device, attractive_forces_device, repulsive_forces_device, diff --git a/src/include/ext/pymodule_ext.h b/src/include/ext/pymodule_ext.h index f6ce78e..ef9f6e9 100644 --- a/src/include/ext/pymodule_ext.h +++ b/src/include/ext/pymodule_ext.h @@ -1,12 +1,11 @@ - #ifndef PYMODULE_EXT_H #define PYMODULE_EXT_H #include - #include "common.h" - #include "include/options.h" - #include "util/distance_utils.h" - #include "bh_tsne.h" + #include "../common.h" + #include "../options.h" + #include "../util/distance_utils.h" + #include "../fit_tsne.h" extern "C" { /** diff --git a/src/include/bh_tsne.h b/src/include/fit_tsne.h similarity index 87% rename from src/include/bh_tsne.h rename to src/include/fit_tsne.h index ae0c2eb..87387d9 100644 --- a/src/include/bh_tsne.h +++ b/src/include/fit_tsne.h @@ -21,15 +21,12 @@ #include "include/util/thrust_transform_functions.h" #include "include/kernels/apply_forces.h" -#include "include/kernels/bh_attr_forces.h" +#include "include/kernels/attr_forces.h" #include "include/kernels/perplexity_search.h" -#include "include/kernels/initialization.h" namespace tsnecuda { -namespace bh { void RunTsne(tsnecuda::Options &opt, tsnecuda::GpuOptions &gpu_opt); } -} #endif diff --git a/src/include/kernels/apply_forces.h b/src/include/kernels/apply_forces.h index 7d399f4..9aeb815 100644 --- a/src/include/kernels/apply_forces.h +++ b/src/include/kernels/apply_forces.h @@ -10,12 +10,11 @@ #ifndef SRC_INCLUDE_KERNELS_APPLY_FORCES_H_ #define SRC_INCLUDE_KERNELS_APPLY_FORCES_H_ -#include "include/common.h" -#include "include/options.h" -#include "include/util/cuda_utils.h" - +#include "../common.h" +#include "../options.h" +#include "../util/cuda_utils.h" + namespace tsnecuda { -namespace bh { __global__ void IntegrationKernel( volatile float * __restrict__ points, @@ -36,7 +35,7 @@ void ApplyForces( thrust::device_vector &points, thrust::device_vector &attr_forces, thrust::device_vector &rep_forces, - thrust::device_vector &gains, + thrust::device_vector &gains, thrust::device_vector &old_forces, const float eta, const float normalization, @@ -47,30 +46,4 @@ void ApplyForces( const int num_blocks ); } - -namespace naive { - -__global__ -void IntegrationKernel( - volatile float * __restrict__ points, - volatile float * __restrict__ forces, - volatile float * __restrict__ gains, - volatile float * __restrict__ old_forces, - const float eta, - const float momentum, - const int num_points - ); - - -void ApplyForces(thrust::device_vector &points, - thrust::device_vector &forces, - thrust::device_vector &gains, - thrust::device_vector &old_forces, - const float eta, - const float momentum, - const int num_points, - const int num_blocks); - -} -} #endif diff --git a/src/include/kernels/bh_attr_forces.h b/src/include/kernels/attr_forces.h similarity index 92% rename from src/include/kernels/bh_attr_forces.h rename to src/include/kernels/attr_forces.h index 174fe1e..59e3404 100644 --- a/src/include/kernels/bh_attr_forces.h +++ b/src/include/kernels/attr_forces.h @@ -9,12 +9,12 @@ #ifndef SRC_INCLUDE_KERNELS_BH_ATTR_FORCES_H_ #define SRC_INCLUDE_KERNELS_BH_ATTR_FORCES_H_ -#include "include/common.h" -#include "include/options.h" -#include "include/util/cuda_utils.h" +#include "../common.h" +#include "../options.h" +#include "../util/cuda_utils.h" namespace tsnecuda { -namespace bh { + __global__ void ComputePijxQijKernel( float * __restrict__ attr_forces, @@ -39,6 +39,5 @@ void ComputeAttractiveForces( const int num_points, const int num_nonzero); } -} #endif diff --git a/src/include/kernels/bh_rep_forces.h b/src/include/kernels/bh_rep_forces.h deleted file mode 100644 index 54a9cb7..0000000 --- a/src/include/kernels/bh_rep_forces.h +++ /dev/null @@ -1,62 +0,0 @@ -/** - * @brief Kernels for computing t-SNE repulsive forces with barnes hut approximation. - * - * @file apply_forces.cu - * @author Roshan Rao - * @date 2018-05-08 - * Copyright (c) 2018, Regents of the University of California - */ -#ifndef SRC_INCLUDE_KERNELS_BH_REP_FORCES_H_ -#define SRC_INCLUDE_KERNELS_BH_REP_FORCES_H_ - -#include "include/common.h" -#include "include/options.h" -#include "include/util/cuda_utils.h" - -// TSNE-Vars -extern __device__ volatile int stepd, bottomd, maxdepthd; -extern __device__ unsigned int blkcntd; -extern __device__ volatile float radiusd; - -namespace tsnecuda { -namespace bh { - -/******************************************************************************/ -/*** compute force ************************************************************/ -/******************************************************************************/ - -__global__ -void ForceCalculationKernel(volatile int * __restrict__ errd, - volatile float * __restrict__ x_vel_device, - volatile float * __restrict__ y_vel_device, - volatile float * __restrict__ normalization_vec_device, - const int * __restrict__ cell_sorted, - const int * __restrict__ children, - const float * __restrict__ cell_mass, - volatile float * __restrict__ x_pos_device, - volatile float * __restrict__ y_pos_device, - const float theta, - const float epsilon, - const int num_nodes, - const int num_points, - const int maxdepth_bh_tree, - const int repulsive_force_threads); - -void ComputeRepulsiveForces(tsnecuda::GpuOptions &gpu_opt, - thrust::device_vector &errd, - thrust::device_vector &repulsive_forces, - thrust::device_vector &normalization_vec, - thrust::device_vector &cell_sorted, - thrust::device_vector &children, - thrust::device_vector &cell_mass, - thrust::device_vector &points, - const float theta, - const float epsilon, - const int num_nodes, - const int num_points, - const int num_blocks); - -} -} - -#endif diff --git a/src/include/kernels/bounding_box.h b/src/include/kernels/bounding_box.h deleted file mode 100644 index 573154e..0000000 --- a/src/include/kernels/bounding_box.h +++ /dev/null @@ -1,53 +0,0 @@ -/** - * @brief Kernels for computing bounding box of points in 2 dimensions. - * - * @file apply_forces.cu - * @author Roshan Rao - * @date 2018-05-08 - * Copyright (c) 2018, Regents of the University of California - */ -#ifndef SRC_INCLUDE_KERNELS_BOUNDING_BOX_H_ -#define SRC_INCLUDE_KERNELS_BOUNDING_BOX_H_ - -#include "include/common.h" -#include "include/options.h" -#include "include/util/cuda_utils.h" - -//TSNE-Vars -extern __device__ volatile int stepd, bottomd, maxdepthd; -extern __device__ unsigned int blkcntd; -extern __device__ volatile float radiusd; - -namespace tsnecuda { -namespace bh { -__global__ -void BoundingBoxKernel( - volatile int * __restrict__ cell_starts, - volatile int * __restrict__ children, - volatile float * __restrict__ cell_mass, - volatile float * __restrict__ x_pos_device, - volatile float * __restrict__ y_pos_device, - volatile float * __restrict__ x_max_device, - volatile float * __restrict__ y_max_device, - volatile float * __restrict__ x_min_device, - volatile float * __restrict__ y_min_device, - const int num_nodes, - const int num_points, - const int bounding_box_threads); - -void ComputeBoundingBox(tsnecuda::GpuOptions &gpu_opt, - thrust::device_vector &cell_starts, - thrust::device_vector &children, - thrust::device_vector &cell_mass, - thrust::device_vector &points, - thrust::device_vector &x_max_device, - thrust::device_vector &y_max_device, - thrust::device_vector &x_min_device, - thrust::device_vector &y_min_device, - const int num_nodes, - const int num_points, - const int num_blocks); -} -} - -#endif diff --git a/src/include/kernels/initialization.h b/src/include/kernels/initialization.h deleted file mode 100644 index 199abab..0000000 --- a/src/include/kernels/initialization.h +++ /dev/null @@ -1,30 +0,0 @@ -/** - * @brief Kernels for computing t-SNE attractive forces with nearest neighbor approximation. - * - * @file apply_forces.cu - * @author Roshan Rao - * @date 2018-05-08 - * Copyright (c) 2018, Regents of the University of California - */ -#ifndef SRC_INCLUDE_KERNELS_INITIALIZATION_H_ -#define SRC_INCLUDE_KERNELS_INITIALIZATION_H_ - -#include "include/common.h" -#include "include/options.h" -#include "include/util/cuda_utils.h" - -#include "include/kernels/apply_forces.h" -#include "include/kernels/bh_attr_forces.h" -#include "include/kernels/perplexity_search.h" - -namespace tsnecuda { -namespace bh { -__global__ void InitializationKernel(int * __restrict errd); -void Initialize(tsnecuda::GpuOptions &gpu_opt, thrust::device_vector &errd); -} -namespace naive { -void Initialize(); -} -} - -#endif diff --git a/src/include/kernels/perplexity_search.h b/src/include/kernels/perplexity_search.h index 1d05576..94d76e0 100644 --- a/src/include/kernels/perplexity_search.h +++ b/src/include/kernels/perplexity_search.h @@ -9,12 +9,12 @@ #ifndef SRC_INCLUDE_KERNELS_PERPLEXITY_SEARCH_H_ #define SRC_INCLUDE_KERNELS_PERPLEXITY_SEARCH_H_ -#include "include/common.h" -#include "include/options.h" -#include "include/util/cuda_utils.h" -#include "include/util/reduce_utils.h" -#include "include/util/matrix_broadcast_utils.h" -#include "include/util/thrust_transform_functions.h" +#include "../common.h" +#include "../options.h" +#include "../util/cuda_utils.h" +#include "../util/reduce_utils.h" +#include "../util/matrix_broadcast_utils.h" +#include "../util/thrust_transform_functions.h" namespace tsnecuda { __global__ @@ -28,7 +28,6 @@ void PerplexitySearchKernel( const float perplexity_target, const float epsilon, const int num_points); -namespace bh { __global__ void ComputePijKernel( volatile float * __restrict__ pij, @@ -47,21 +46,4 @@ void SearchPerplexity(tsnecuda::GpuOptions &gpu_opt, const int num_near_neighbors); } -namespace naive { -__global__ -void ComputePijKernel( - volatile float * __restrict__ pij, - const float * __restrict__ squared_dist, - const float * __restrict__ betas, - const unsigned int num_points); - -void SearchPerplexity(cublasHandle_t &handle, - thrust::device_vector &pij, - thrust::device_vector &squared_dist, - const float perplexity_target, - const float epsilon, - const int num_points); -} -} - #endif diff --git a/src/include/kernels/tree_builder.h b/src/include/kernels/tree_builder.h deleted file mode 100644 index b56012d..0000000 --- a/src/include/kernels/tree_builder.h +++ /dev/null @@ -1,49 +0,0 @@ -/** - * @brief Kernels for computing t-SNE attractive forces with nearest neighbor approximation. - * - * @file apply_forces.cu - * @author Roshan Rao - * @date 2018-05-08 - * Copyright (c) 2018, Regents of the University of California - */ -#ifndef SRC_INCLUDE_KERNELS_TREE_BUILDER_H_ -#define SRC_INCLUDE_KERNELS_TREE_BUILDER_H_ - -#include "include/common.h" -#include "include/options.h" -#include "include/util/cuda_utils.h" - -//TSNE-Vars -extern __device__ volatile int stepd, bottomd, maxdepthd; -extern __device__ unsigned int blkcntd; -extern __device__ volatile float radiusd; - -namespace tsnecuda { -namespace bh { -__global__ -void ClearKernel1(volatile int * __restrict__ children, const int num_nodes, const int num_points); - -__global__ -void TreeBuildingKernel(volatile int * __restrict__ errd, - volatile int * __restrict__ children, - volatile float * __restrict__ x_pos_device, - volatile float * __restrict__ y_pos_device, - const int num_nodes, - const int num_points); - -__global__ -void ClearKernel2(volatile int * __restrict__ cell_starts, volatile float * __restrict__ cell_mass, const int num_nodes); - -void BuildTree(tsnecuda::GpuOptions &gpu_opt, - thrust::device_vector &errd, - thrust::device_vector &children, - thrust::device_vector &cell_starts, - thrust::device_vector &cell_mass, - thrust::device_vector &points, - const int num_nodes, - const int num_points, - const int num_blocks); -} -} - -#endif diff --git a/src/include/kernels/tree_sort.h b/src/include/kernels/tree_sort.h deleted file mode 100644 index 7f66193..0000000 --- a/src/include/kernels/tree_sort.h +++ /dev/null @@ -1,42 +0,0 @@ -/** - * @brief Kernels for sorting tree cells by morton code. - * - * @file tree_sort.cu - * @author Roshan Rao - * @date 2018-05-08 - * Copyright (c) 2018, Regents of the University of California - */ -#ifndef SRC_INCLUDE_KERNELS_TREE_SORT_H_ -#define SRC_INCLUDE_KERNELS_TREE_SORT_H_ - -#include "include/common.h" -#include "include/options.h" -#include "include/util/cuda_utils.h" - -//TSNE-Vars -extern __device__ volatile int stepd, bottomd, maxdepthd; -extern __device__ unsigned int blkcntd; -extern __device__ volatile float radiusd; - -namespace tsnecuda { -namespace bh { -__global__ -void SortKernel(int * __restrict__ cell_sorted, - volatile int * __restrict__ cell_starts, - int * __restrict__ children, - const int * __restrict__ cell_counts, - const int num_nodes, - const int num_points); - -void SortCells(tsnecuda::GpuOptions &gpu_opt, - thrust::device_vector &cell_sorted, - thrust::device_vector &cell_starts, - thrust::device_vector &children, - thrust::device_vector &cell_counts, - const int num_nodes, - const int num_points, - const int num_blocks); -} -} - -#endif diff --git a/src/include/kernels/tree_summary.h b/src/include/kernels/tree_summary.h deleted file mode 100644 index b12a7c8..0000000 --- a/src/include/kernels/tree_summary.h +++ /dev/null @@ -1,45 +0,0 @@ -/** - * @brief Kernels for computing summary of a quad tree. - * - * @file apply_forces.cu - * @author Roshan Rao - * @date 2018-05-08 - * Copyright (c) 2018, Regents of the University of California - */ -#ifndef SRC_INCLUDE_KERNELS_TREE_SUMMARY_H_ -#define SRC_INCLUDE_KERNELS_TREE_SUMMARY_H_ - -#include "include/common.h" -#include "include/options.h" -#include "include/util/cuda_utils.h" - -//TSNE-Vars -extern __device__ volatile int stepd, bottomd, maxdepthd; -extern __device__ unsigned int blkcntd; -extern __device__ volatile float radiusd; - -namespace tsnecuda { -namespace bh { -__global__ -void SummarizationKernel( - volatile int * __restrict cell_counts, - volatile float * __restrict cell_mass, - volatile float * __restrict x_pos_device, - volatile float * __restrict y_pos_device, - const int * __restrict children, - const int num_nodes, - const int num_points, - const int gpu_summary_threads); - -void SummarizeTree(tsnecuda::GpuOptions &gpu_opt, - thrust::device_vector &cell_counts, - thrust::device_vector &children, - thrust::device_vector &cell_mass, - thrust::device_vector &pts_device, - const int num_nodes, - const int num_points, - const int num_blocks); -} -} - -#endif diff --git a/src/include/naive_tsne.h b/src/include/naive_tsne.h deleted file mode 100644 index 1101c1f..0000000 --- a/src/include/naive_tsne.h +++ /dev/null @@ -1,149 +0,0 @@ -/** - * @brief Naive implementation of T-SNE O(n^2) - * - * @file naive_tsne.h - * @author David Chan - * @date 2018-04-04 - */ - -#ifndef NAIVE_TSNE_H -#define NAIVE_TSNE_H - -#include "common.h" -#include "util/cuda_utils.h" -#include "util/math_utils.h" -#include "util/matrix_broadcast_utils.h" -#include "util/reduce_utils.h" -#include "util/distance_utils.h" -#include "util/random_utils.h" -#include "util/thrust_utils.h" -#include "include/util/thrust_transform_functions.h" -#include "include/util/debug_utils.h" - -namespace NaiveTSNE { - /** - * @brief Compute the Pij distribution O(n^2) - * - * @param handle CUBLAS handle - * @param points The points in an NxNDIM column-major array - * @param sigma The list of sigmas for each point - * @param pij The computed pij value - * @param N The number of points - * @param NDIMS The number of dimensions for each point - */ - void compute_pij(cublasHandle_t &handle, - thrust::device_vector &pij, - const thrust::device_vector &points, - const thrust::device_vector &sigma, - const unsigned int N, - const unsigned int NDIMS); - /** - * @brief Compute the Pij based on P(i|j) using Pij = P(i|j) + P(j|i)/2N O(n^2) - * - * @param handle CUBLAS handle - * @param pij_vals Computed P(i|j) values - * @param N The number of points - */ - void symmetrize_pij(cublasHandle_t &handle, - thrust::device_vector &pij, - const unsigned int N); - - /** - * @brief Searches the right sigmas for computing pij - * - * @param handle CUBLAS handle - * @param points Original Points - * @param perplexity_target Target perplexity for the Search - * @param eps Error tolerance for the perplexity search - * @param N The number of points - * @param NDIMS Number of Dimensions of points - * @return thrust::device_vector Computed Sigmas based on the perplexity target - */ - thrust::device_vector search_perplexity(cublasHandle_t &handle, - thrust::device_vector &points, - const float perplexity_target, - const float eps, - const unsigned int N, - const unsigned int NDIMS); - - ///@private - void thrust_search_perplexity(cublasHandle_t &handle, - thrust::device_vector &sigmas, - thrust::device_vector &lower_bound, - thrust::device_vector &upper_bound, - thrust::device_vector &perplexity, - const thrust::device_vector &pij, - const float target_perplexity, - const unsigned int N); - - /** - * @brief Compute the T-SNE gradients - * - * @param handle CUBLAS handle - * @param forces The forces output array - * @param dist Placeholder array for the distances - * @param ys The current projected points - * @param pij The P(i|j) distribution - * @param qij Placeholder for the computed Q(i|j) distribution - * @param N The number of points - * @param PROJDIM The number of dimensions to project into - * @param eta The learning rate - * @return float Returns the K-L Divergence of the two distributions (as the loss) - */ - float compute_gradients(cublasHandle_t &handle, - thrust::device_vector &forces, - thrust::device_vector &dist, - thrust::device_vector &ys, - thrust::device_vector &pij, - thrust::device_vector &qij, - const unsigned int N, - const unsigned int PROJDIM, - float eta); - - /** - * @brief Perform T-SNE using the naive O(n^2) forces computation - * - * @param handle The CUBLAS handle to use - * @param points The array of points in column-major NxNDIM matrix - * @param N The number of points - * @param NDIMS The number of dimentions the original points are in - * @param PROJDIM The number of dimensions to project to - * @return thrust::device_vector The projected points - */ - thrust::device_vector tsne(cublasHandle_t &handle, - thrust::device_vector &points, - const unsigned int N, - const unsigned int NDIMS, - const unsigned int PROJDIM); - - /** - * @brief Perform T-SNE using the naive O(n^2) forces method with some - * additional parameters - * - * @param handle The CUBLAS handle to use - * @param d_points The array of points in coloumn-major NxNDIM matrix - * @param N_POINTS The number of points that we're using - * @param N_DIMS The number of dimensions that the original points are in - * @param proj_dim The number of dimensions we're projecting to - * @param perplexity The perplexity that we're shooting for in our distribution - * @param early_ex The amount of early exaggeration to use - * @param learning_rate The learning rate to use - * @param n_iter The number of iterations to perform - * @param n_iter_np The number of iterations to go without progress before early termination - * @param min_g_norm The norm of the forces to keep above - * @return thrust::device_vector The projected points - */ - thrust::device_vector tsne(cublasHandle_t &handle, - thrust::device_vector &d_points, - unsigned int N_POINTS, - unsigned int N_DIMS, - unsigned int proj_dim, - float perplexity, - float early_ex, - float learning_rate, - unsigned int n_iter, - unsigned int n_iter_np, - float min_g_norm); -} - -#endif diff --git a/src/include/naive_tsne_cpu.h b/src/include/naive_tsne_cpu.h deleted file mode 100644 index 60a8362..0000000 --- a/src/include/naive_tsne_cpu.h +++ /dev/null @@ -1,87 +0,0 @@ -/** - * @brief Naive implementation of T-SNE O(n^2) on CPU - * - * @file naive_tsne_cpu.h - * @author Forrest Huang - * @date 2018-04-11 - */ - -#ifndef NAIVE_TSNE_CPU_H -#define NAIVE_TSNE_CPU_H - -#include "common.h" -#include "util/cuda_utils.h" -#include "util/math_utils.h" -#include "util/matrix_broadcast_utils.h" -#include "util/reduce_utils.h" -#include "util/distance_utils.h" -#include "util/random_utils.h" - - -std::vector squared_pairwise_dist(std::vector &points, const unsigned int N, const unsigned int NDIMS); - -std::vector sigmas_search_cpu(std::vector &points, - const unsigned int N, - const unsigned int NDIMS, - float target_perplexity); - -bool perplexity_equal(const float delta, float perplexity, float target_perplexity); - -bool compare_perplexity(std::vector& pij, - float& lo, - float& mid, - float& high, - const unsigned int i, - const unsigned int N, - const float delta, - const float target_perplexity); - -float get_perplexity(std::vector & pij, const unsigned int i, unsigned int N); - -void recompute_pij_row_cpu(std::vector &points, - std::vector &pij, - float sigma, - float i, - const unsigned int N, - const unsigned int NDIMS); - -std::vector compute_pij_cpu(std::vector &points, - std::vector &sigma, - const unsigned int N, - const unsigned int NDIMS); - -std::vector compute_qij_cpu(std::vector& ys, const unsigned int N, const unsigned int PROJDIMS); - -float kl_div(float pij, float qij); - -float compute_gradients_cpu(std::vector &forces, - std::vector &dist, - std::vector &ys, - std::vector &pij, - std::vector &qij, - const unsigned int N, - float eta); - -std::vector naive_tsne_cpu(std::vector &points, - const unsigned int N, - const unsigned int NDIMS); - - -/* -std::vector compute_qij_cpu(std::vector &ys, - const unsigned int N, - const unsigned int PROJDIM); - -float compute_gradients_cpu(std::vector &forces, - std::vector &dist, - std::vector &ys, - std::vector &pij, - std::vector &qij, - const unsigned int N, - float eta); - -std::vector naive_tsne_cpu(std::vector &points, - const unsigned int N, - const unsigned int NDIMS); -*/ -#endif diff --git a/src/include/nbodyfft.h b/src/include/nbodyfft.h index 3c5cd93..01154f5 100644 --- a/src/include/nbodyfft.h +++ b/src/include/nbodyfft.h @@ -2,26 +2,34 @@ #define NBODYFFT_H #include -#include "common.h" +#include #include +#include "common.h" +#include "include/util/cuda_utils.h" +#include "include/util/matrix_broadcast_utils.h" -using namespace std; - -typedef float (*kernel_type)(float, float); - -typedef float (*kernel_type_2d)(float, float, float, float); +namespace tsnecuda { -void precompute_2d(cufftHandle &plan_kernel_tilde, float x_max, float x_min, float y_max, float y_min, int n_boxes, int n_interpolation_points, - thrust::device_vector &box_lower_bounds_device, thrust::device_vector &box_upper_bounds_device, - thrust::device_vector &kernel_tilde_device, thrust::device_vector> &fft_kernel_tilde_device); +void PrecomputeFFT2D( + cufftHandle &plan_kernel_tilde, + float x_max, + float x_min, + float y_max, + float y_min, + int n_boxes, + int n_interpolation_points, + thrust::device_vector &box_lower_bounds_device, + thrust::device_vector &box_upper_bounds_device, + thrust::device_vector &kernel_tilde_device, + thrust::device_vector > &fft_kernel_tilde_device); -void n_body_fft_2d( +void NbodyFFT2D( cufftHandle &plan_dft, cufftHandle &plan_idft, - int N, - int n_terms, + int N, + int n_terms, int n_boxes, - int n_interpolation_points, + int n_interpolation_points, thrust::device_vector> &fft_kernel_tilde_device, int n_total_boxes, int total_interpolation_points, @@ -33,7 +41,7 @@ void n_body_fft_2d( thrust::device_vector &fft_input, thrust::device_vector> &fft_w_coefficients, thrust::device_vector &fft_output, - thrust::device_vector &point_box_idx_device, + thrust::device_vector &point_box_idx_device, thrust::device_vector &x_in_box_device, thrust::device_vector &y_in_box_device, thrust::device_vector &points_device, @@ -51,9 +59,6 @@ void n_body_fft_2d( thrust::device_vector &y_interpolated_values_device, thrust::device_vector &potentialsQij_device); -void interpolate(int n_interpolation_points, int N, const float *y_in_box, const float *y_tilde_spacings, - float *interpolated_values, const float *denominator); - -float* get_ntime(); +} #endif diff --git a/src/include/tsne_vars.h b/src/include/tsne_vars.h deleted file mode 100644 index 537bd3e..0000000 --- a/src/include/tsne_vars.h +++ /dev/null @@ -1,20 +0,0 @@ -/** - * @brief Barnes-hut t-SNE global variables - * - * @file apply_forces.cu - * @author Roshan Rao - * @date 2018-05-08 - * Copyright (c) 2018, Regents of the University of California - */ -#ifndef INITIALIZATION_VARS -#ifndef SRC_INCLUDE_KERNELS_TSNEVARS_H_ -#define SRC_INCLUDE_KERNELS_TSNEVARS_H_ - -extern __device__ volatile int stepd, bottomd, maxdepthd; -extern __device__ unsigned int blkcntd; -extern __device__ volatile float radiusd; - -#endif -#endif - - diff --git a/src/kernels/apply_forces.cu b/src/kernels/apply_forces.cu index a006223..0160993 100644 --- a/src/kernels/apply_forces.cu +++ b/src/kernels/apply_forces.cu @@ -1,9 +1,8 @@ - /* Apply forces to the points with momentum, exaggeration, etc. */ -#include "include/kernels/apply_forces.h" +#include "../include/kernels/apply_forces.h" /******************************************************************************/ @@ -11,7 +10,7 @@ /******************************************************************************/ // Edited to add momentum, repulsive, attr forces, etc. __global__ -void tsnecuda::bh::IntegrationKernel( +void tsnecuda::IntegrationKernel( volatile float * __restrict__ points, volatile float * __restrict__ attr_forces, volatile float * __restrict__ rep_forces, @@ -59,52 +58,11 @@ void tsnecuda::bh::IntegrationKernel( } } -__global__ -void tsnecuda::naive::IntegrationKernel( - volatile float * __restrict__ points, - volatile float * __restrict__ forces, - volatile float * __restrict__ gains, - volatile float * __restrict__ old_forces, - const float eta, - const float momentum, - const int num_points) -{ - register int i, inc; - register float dx, dy, ux, uy, gx, gy; - - // iterate over all bodies assigned to thread - inc = blockDim.x * gridDim.x; - for (i = threadIdx.x + blockIdx.x * blockDim.x; i < num_points; i += inc) { - ux = old_forces[i]; - uy = old_forces[num_points + i]; - gx = gains[i]; - gy = gains[num_points + i]; - dx = forces[i]; - dy = forces[num_points + i]; - - gx = (signbit(dx) != signbit(ux)) ? gx + 0.2 : gx * 0.8; - gy = (signbit(dy) != signbit(uy)) ? gy + 0.2 : gy * 0.8; - gx = (gx < 0.01) ? 0.01 : gx; - gy = (gy < 0.01) ? 0.01 : gy; - - ux = momentum * ux - eta * gx * dx; - uy = momentum * uy - eta * gy * dy; - - points[i] += ux; - points[num_points + i] += uy; - - old_forces[i] = ux; - old_forces[num_points + i] = uy; - gains[i] = gx; - gains[num_points + i] = gy; - } -} - -void tsnecuda::bh::ApplyForces(tsnecuda::GpuOptions &gpu_opt, +void tsnecuda::ApplyForces(tsnecuda::GpuOptions &gpu_opt, thrust::device_vector &points, thrust::device_vector &attr_forces, thrust::device_vector &rep_forces, - thrust::device_vector &gains, + thrust::device_vector &gains, thrust::device_vector &old_forces, const float eta, const float normalization, @@ -114,8 +72,8 @@ void tsnecuda::bh::ApplyForces(tsnecuda::GpuOptions &gpu_opt, const int num_points, const int num_blocks) { - tsnecuda::bh::IntegrationKernel<<>>( + tsnecuda::IntegrationKernel<<>>( thrust::raw_pointer_cast(points.data()), thrust::raw_pointer_cast(attr_forces.data()), thrust::raw_pointer_cast(rep_forces.data()), @@ -125,21 +83,3 @@ void tsnecuda::bh::ApplyForces(tsnecuda::GpuOptions &gpu_opt, num_nodes, num_points); GpuErrorCheck(cudaDeviceSynchronize()); } - -void tsnecuda::naive::ApplyForces(thrust::device_vector &points, - thrust::device_vector &forces, - thrust::device_vector &gains, - thrust::device_vector &old_forces, - const float eta, - const float momentum, - const int num_points, - const int num_blocks) -{ - tsnecuda::naive::IntegrationKernel<<>>( - thrust::raw_pointer_cast(points.data()), - thrust::raw_pointer_cast(forces.data()), - thrust::raw_pointer_cast(gains.data()), - thrust::raw_pointer_cast(old_forces.data()), - eta, momentum, num_points); - GpuErrorCheck(cudaDeviceSynchronize()); -} diff --git a/src/kernels/bh_attr_forces.cu b/src/kernels/attr_forces.cu similarity index 92% rename from src/kernels/bh_attr_forces.cu rename to src/kernels/attr_forces.cu index 6ebc319..1d83778 100644 --- a/src/kernels/bh_attr_forces.cu +++ b/src/kernels/attr_forces.cu @@ -6,10 +6,10 @@ Attractive force is given by pij*qij. */ -#include "include/kernels/bh_attr_forces.h" +#include "../include/kernels/attr_forces.h" __global__ -void tsnecuda::bh::ComputePijxQijKernel( +void tsnecuda::ComputePijxQijKernel( float * __restrict__ attr_forces, const float * __restrict__ pij, const float * __restrict__ points, @@ -34,7 +34,7 @@ void tsnecuda::bh::ComputePijxQijKernel( atomicAdd(attr_forces + num_nodes + 1 + i, pijqij * dy); } -void tsnecuda::bh::ComputeAttractiveForces( +void tsnecuda::ComputeAttractiveForces( tsnecuda::GpuOptions &gpu_opt, cusparseHandle_t &handle, cusparseMatDescr_t &descr, @@ -53,7 +53,7 @@ void tsnecuda::bh::ComputeAttractiveForces( // TODO: this is bad style const int BLOCKSIZE = 1024; const int NBLOCKS = iDivUp(num_nonzero, BLOCKSIZE); - tsnecuda::bh::ComputePijxQijKernel<<>>( + tsnecuda::ComputePijxQijKernel<<>>( thrust::raw_pointer_cast(attr_forces.data()), thrust::raw_pointer_cast(sparse_pij.data()), thrust::raw_pointer_cast(points.data()), diff --git a/src/kernels/bh_rep_forces.cu b/src/kernels/bh_rep_forces.cu deleted file mode 100644 index a250f6b..0000000 --- a/src/kernels/bh_rep_forces.cu +++ /dev/null @@ -1,171 +0,0 @@ -// TODO: add copyright - -/* - Traverses the tree and calculates approximate repulsive forces via Barnes-Hut. - - t-SNE repulsive forces are given by qij*qijN(y_i - y_j). This also simultaneously - calculates tne normalization constant N. - -*/ - -#include "include/kernels/bh_rep_forces.h" - -/******************************************************************************/ -/*** compute force ************************************************************/ -/******************************************************************************/ - -__global__ -void tsnecuda::bh::ForceCalculationKernel(volatile int * __restrict__ errd, - volatile float * __restrict__ x_vel_device, - volatile float * __restrict__ y_vel_device, - volatile float * __restrict__ normalization_vec_device, - const int * __restrict__ cell_sorted, - const int * __restrict__ children, - const float * __restrict__ cell_mass, - volatile float * __restrict__ x_pos_device, - volatile float * __restrict__ y_pos_device, - const float theta, - const float epsilon, - const int num_nodes, - const int num_points, - const int maxdepth_bh_tree, - const int repulsive_force_threads - ) -{ - register int i, j, k, n, depth, base, sbase, diff, pd, nd; - register float px, py, vx, vy, dx, dy, normsum, tmp, mult; - extern __shared__ int force_shared_memory[]; - int* pos = (int*) force_shared_memory; - int* node = (int*) (pos + maxdepth_bh_tree*repulsive_force_threads/32); - float* dq = (float*) (node + maxdepth_bh_tree*repulsive_force_threads/32); - - if (0 == threadIdx.x) { - dq[0] = (radiusd * radiusd) / (theta * theta); - for (i = 1; i < maxdepthd; i++) { - dq[i] = dq[i - 1] * 0.25f; // radius is halved every level of tree so squared radius is quartered - dq[i - 1] += epsilon; - } - dq[i - 1] += epsilon; - - if (maxdepthd > maxdepth_bh_tree) { - *errd = maxdepthd; - } - } - __syncthreads(); - - if (maxdepthd <= maxdepth_bh_tree) { - // figure out first thread in each warp (lane 0) - base = threadIdx.x / 32; - sbase = base * 32; - j = base * maxdepth_bh_tree; - - diff = threadIdx.x - sbase; - // make multiple copies to avoid index calculations later - if (diff < maxdepth_bh_tree) { - dq[diff+j] = dq[diff]; - } - __syncthreads(); - __threadfence_block(); - - // iterate over all bodies assigned to thread - for (k = threadIdx.x + blockIdx.x * blockDim.x; k < num_points; k += blockDim.x * gridDim.x) { - i = cell_sorted[k]; // get permuted/sorted index - // cache position info - px = x_pos_device[i]; - py = y_pos_device[i]; - - vx = 0.0f; - vy = 0.0f; - normsum = 0.0f; - - // initialize iteration stack, i.e., push root node onto stack - depth = j; - if (sbase == threadIdx.x) { - pos[j] = 0; - node[j] = num_nodes * 4; - } - - do { - // stack is not empty - pd = pos[depth]; - nd = node[depth]; - while (pd < 4) { - // node on top of stack has more children to process - n = children[nd + pd]; // load child pointer - pd++; - - if (n >= 0) { - dx = px - x_pos_device[n]; - dy = py - y_pos_device[n]; - tmp = dx*dx + dy*dy + epsilon; // distance squared plus small constant to prevent zeros - #if (CUDART_VERSION >= 9000) - if ((n < num_points) || __all_sync(__activemask(), tmp >= dq[depth])) { // check if all threads agree that cell is far enough away (or is a body) - #else - if ((n < num_points) || __all(tmp >= dq[depth])) { - #endif - // from bhtsne - sptree.cpp - tmp = 1 / (1 + tmp); - mult = cell_mass[n] * tmp; - normsum += mult; - mult *= tmp; - vx += dx * mult; - vy += dy * mult; - } else { - // push cell onto stack - if (sbase == threadIdx.x) { // maybe don't push and inc if last child - pos[depth] = pd; - node[depth] = nd; - } - depth++; - pd = 0; - nd = n * 4; - } - } else { - pd = 4; // early out because all remaining children are also zero - } - } - depth--; // done with this level - } while (depth >= j); - - if (stepd >= 0) { - // update velocity - x_vel_device[i] += vx; - y_vel_device[i] += vy; - normalization_vec_device[i] = normsum - 1.0f; // subtract one for self computation (qii) - } - } - } -} - -void tsnecuda::bh::ComputeRepulsiveForces(tsnecuda::GpuOptions &gpu_opt, - thrust::device_vector &errd, - thrust::device_vector &repulsive_forces, - thrust::device_vector &normalization_vec, - thrust::device_vector &cell_sorted, - thrust::device_vector &children, - thrust::device_vector &cell_mass, - thrust::device_vector &points, - const float theta, - const float epsilon, - const int num_nodes, - const int num_points, - const int num_blocks) -{ - tsnecuda::bh::ForceCalculationKernel<<>>( - thrust::raw_pointer_cast(errd.data()), - thrust::raw_pointer_cast(repulsive_forces.data()), - thrust::raw_pointer_cast(repulsive_forces.data() + num_nodes + 1), - thrust::raw_pointer_cast(normalization_vec.data()), - thrust::raw_pointer_cast(cell_sorted.data()), - thrust::raw_pointer_cast(children.data()), - thrust::raw_pointer_cast(cell_mass.data()), - thrust::raw_pointer_cast(points.data()), - thrust::raw_pointer_cast(points.data() + num_nodes + 1), - theta, epsilon, num_nodes, num_points, 32, //TODO: Encode this variable somewhere - gpu_opt.repulsive_kernel_threads - ); - GpuErrorCheck(cudaDeviceSynchronize()); -} diff --git a/src/kernels/bounding_box.cu b/src/kernels/bounding_box.cu deleted file mode 100644 index af0793e..0000000 --- a/src/kernels/bounding_box.cu +++ /dev/null @@ -1,133 +0,0 @@ -/* - Computes bounding box for all points for barnes hut. Also resets start, child, mass -*/ - -#include "include/kernels/bounding_box.h" - -/******************************************************************************/ -/*** compute center and radius ************************************************/ -/******************************************************************************/ - -__global__ -void tsnecuda::bh::BoundingBoxKernel( - volatile int * __restrict__ cell_starts, - volatile int * __restrict__ children, - volatile float * __restrict__ cell_mass, - volatile float * __restrict__ x_pos_device, - volatile float * __restrict__ y_pos_device, - volatile float * __restrict__ x_max_device, - volatile float * __restrict__ y_max_device, - volatile float * __restrict__ x_min_device, - volatile float * __restrict__ y_min_device, - const int num_nodes, - const int num_points, - const int bounding_box_threads) -{ - register int i, j, k, inc; - register float val, minx, maxx, miny, maxy; - extern __shared__ float bounding_shared_memory[]; - float* x_min_shared = bounding_shared_memory; - float* x_max_shared = x_min_shared + bounding_box_threads; - float* y_min_shared = x_max_shared + bounding_box_threads; - float* y_max_shared = y_min_shared + bounding_box_threads; - - // initialize with valid data (in case #bodies < #threads) - minx = maxx = x_pos_device[0]; - miny = maxy = y_pos_device[0]; - - // scan all bodies - i = threadIdx.x; - inc = bounding_box_threads * gridDim.x; - for (j = i + blockIdx.x * bounding_box_threads; j < num_points; j += inc) { - val = x_pos_device[j]; - minx = fminf(minx, val); - maxx = fmaxf(maxx, val); - val = y_pos_device[j]; - miny = fminf(miny, val); - maxy = fmaxf(maxy, val); - } - - // reduction in shared memory - x_min_shared[i] = minx; - x_max_shared[i] = maxx; - y_min_shared[i] = miny; - y_max_shared[i] = maxy; - - for (j = bounding_box_threads / 2; j > 0; j /= 2) { - __syncthreads(); - if (i < j) { - k = i + j; - x_min_shared[i] = minx = fminf(minx, x_min_shared[k]); - x_max_shared[i] = maxx = fmaxf(maxx, x_max_shared[k]); - y_min_shared[i] = miny = fminf(miny, y_min_shared[k]); - y_max_shared[i] = maxy = fmaxf(maxy, y_max_shared[k]); - } - } - - // write block result to global memory - if (i == 0) { - k = blockIdx.x; - x_min_device[k] = minx; - x_max_device[k] = maxx; - y_min_device[k] = miny; - y_max_device[k] = maxy; - __threadfence(); - - inc = gridDim.x - 1; - if (inc == atomicInc(&blkcntd, inc)) { - // I'm the last block, so combine all block results - for (j = 0; j <= inc; j++) { - minx = fminf(minx, x_min_device[j]); - maxx = fmaxf(maxx, x_max_device[j]); - miny = fminf(miny, y_min_device[j]); - maxy = fmaxf(maxy, y_max_device[j]); - } - - // compute 'radius' - radiusd = fmaxf(maxx - minx, maxy - miny) * 0.5f + 1e-5f; - - // create root node - k = num_nodes; - bottomd = k; - - cell_mass[k] = -1.0f; - cell_starts[k] = 0; - x_pos_device[k] = (minx + maxx) * 0.5f; - y_pos_device[k] = (miny + maxy) * 0.5f; - k *= 4; - for (i = 0; i < 4; i++) children[k + i] = -1; - - stepd++; - } - } -} - -void tsnecuda::bh::ComputeBoundingBox(tsnecuda::GpuOptions &gpu_opt, - thrust::device_vector &cell_starts, - thrust::device_vector &children, - thrust::device_vector &cell_mass, - thrust::device_vector &points, - thrust::device_vector &x_max_device, - thrust::device_vector &y_max_device, - thrust::device_vector &x_min_device, - thrust::device_vector &y_min_device, - const int num_nodes, - const int num_points, - const int num_blocks) -{ - tsnecuda::bh::BoundingBoxKernel<<>>( - thrust::raw_pointer_cast(cell_starts.data()), - thrust::raw_pointer_cast(children.data()), - thrust::raw_pointer_cast(cell_mass.data()), - thrust::raw_pointer_cast(points.data()), - thrust::raw_pointer_cast(points.data() + num_nodes + 1), - thrust::raw_pointer_cast(x_max_device.data()), - thrust::raw_pointer_cast(y_max_device.data()), - thrust::raw_pointer_cast(x_min_device.data()), - thrust::raw_pointer_cast(y_min_device.data()), - num_nodes, num_points, - gpu_opt.bounding_kernel_threads); - GpuErrorCheck(cudaDeviceSynchronize()); -} diff --git a/src/kernels/initialization.cu b/src/kernels/initialization.cu index 31205bc..b7d4259 100644 --- a/src/kernels/initialization.cu +++ b/src/kernels/initialization.cu @@ -1,18 +1,13 @@ -#include "include/kernels/initialization.h" +#include "../include/kernels/initialization.h" /******************************************************************************/ /*** initialize memory ********************************************************/ /******************************************************************************/ -void tsnecuda::bh::Initialize(tsnecuda::GpuOptions &gpu_opt, thrust::device_vector &errd) +void tsnecuda::Initialize(tsnecuda::GpuOptions &gpu_opt, thrust::device_vector &errd) { cudaFuncSetCacheConfig(tsnecuda::bh::IntegrationKernel, cudaFuncCachePreferL1); cudaFuncSetCacheConfig(tsnecuda::bh::ComputePijxQijKernel, cudaFuncCachePreferShared); GpuErrorCheck(cudaDeviceSynchronize()); } - -void tsnecuda::naive::Initialize() -{ - // TODO: Add cache config sets for naive -} diff --git a/src/kernels/perplexity_search.cu b/src/kernels/perplexity_search.cu index 413e98d..884979f 100644 --- a/src/kernels/perplexity_search.cu +++ b/src/kernels/perplexity_search.cu @@ -5,10 +5,10 @@ Note that FAISS returns the first row as the same point, with distance = 0. pii is defined as zero. */ -#include "include/kernels/perplexity_search.h" +#include "../include/kernels/perplexity_search.h" __global__ -void tsnecuda::bh::ComputePijKernel( +void tsnecuda::ComputePijKernel( volatile float * __restrict__ pij, const float * __restrict__ squared_dist, const float * __restrict__ betas, @@ -29,31 +29,7 @@ void tsnecuda::bh::ComputePijKernel( // condition deals with evaluation of pii // FAISS neighbor zero is i so ignore it - pij[TID] = (j == 0 & dist == 0.0f) ? 0.0f : __expf(-beta * dist); -} - -__global__ -void tsnecuda::naive::ComputePijKernel( - volatile float * __restrict__ pij, - const float * __restrict__ squared_dist, - const float * __restrict__ betas, - const unsigned int num_points) -{ - register int TID, i, j; - register float dist, beta; - - TID = threadIdx.x + blockIdx.x * blockDim.x; - if (TID >= num_points * num_points) return; - - i = TID / num_points; - j = TID % num_points; - - beta = betas[i]; - dist = squared_dist[TID]; - - // condition deals with evaluation of pii - // FAISS neighbor zero is i so ignore it - pij[TID] = (j == i & dist == 0.0f) ? 0.0f : __expf(-beta * dist); + pij[TID] = (j == 0 & dist == 0.0f) ? 0.0f : __expf(-beta * dist); } __global__ @@ -98,7 +74,7 @@ void tsnecuda::PerplexitySearchKernel( found[i] = is_found; } -void tsnecuda::bh::SearchPerplexity(tsnecuda::GpuOptions &gpu_opt, +void tsnecuda::SearchPerplexity(tsnecuda::GpuOptions &gpu_opt, cublasHandle_t &handle, thrust::device_vector &pij, thrust::device_vector &squared_dist, @@ -127,13 +103,13 @@ void tsnecuda::bh::SearchPerplexity(tsnecuda::GpuOptions &gpu_opt, thrust::device_vector row_sum, neg_entropy; do { // compute Gaussian Kernel row - tsnecuda::bh::ComputePijKernel<<>>( + tsnecuda::ComputePijKernel<<>>( thrust::raw_pointer_cast(pij.data()), thrust::raw_pointer_cast(squared_dist.data()), thrust::raw_pointer_cast(betas.data()), num_points, num_near_neighbors); GpuErrorCheck(cudaDeviceSynchronize()); - + // compute entropy of current row row_sum = tsnecuda::util::ReduceSum(handle, pij, num_near_neighbors, num_points, 0); thrust::transform(pij.begin(), pij.end(), entropy.begin(), tsnecuda::util::FunctionalEntropy()); @@ -158,63 +134,3 @@ void tsnecuda::bh::SearchPerplexity(tsnecuda::GpuOptions &gpu_opt, tsnecuda::util::BroadcastMatrixVector(pij, row_sum, num_near_neighbors, num_points, thrust::divides(), 1, 1.0f); } - -void tsnecuda::naive::SearchPerplexity( - cublasHandle_t &handle, - thrust::device_vector &pij, - thrust::device_vector &squared_dist, - const float perplexity_target, - const float epsilon, - const int num_points) -{ - // use beta instead of sigma (this matches the bhtsne code but not the paper) - // beta is just multiplicative instead of divisive (changes the way binary search works) - thrust::device_vector betas(num_points, 1.0f); - thrust::device_vector lower_bound_beta(num_points, -FLT_MAX); - thrust::device_vector upper_bound_beta(num_points, FLT_MAX); - thrust::device_vector entropy(num_points * num_points); - thrust::device_vector found(num_points); - - // TODO: this doesn't really fit with the style - const int BLOCKSIZE1 = 1024; - const int NBLOCKS1 = iDivUp(num_points * num_points, BLOCKSIZE1); - - const int BLOCKSIZE2 = 128; - const int NBLOCKS2 = iDivUp(num_points, BLOCKSIZE2); - - size_t iters = 0; - int all_found = 0; - thrust::device_vector row_sum, neg_entropy; - do { - // compute Gaussian Kernel row - tsnecuda::naive::ComputePijKernel<<>>( - thrust::raw_pointer_cast(pij.data()), - thrust::raw_pointer_cast(squared_dist.data()), - thrust::raw_pointer_cast(betas.data()), - num_points); - GpuErrorCheck(cudaDeviceSynchronize()); - - // compute entropy of current row - row_sum = tsnecuda::util::ReduceSum(handle, pij, num_points, num_points, 0); - thrust::transform(pij.begin(), pij.end(), entropy.begin(), tsnecuda::util::FunctionalEntropy()); - neg_entropy = tsnecuda::util::ReduceAlpha(handle, entropy, num_points, num_points, -1.0f, 0); - - // binary search for beta - tsnecuda::PerplexitySearchKernel<<>>( - thrust::raw_pointer_cast(betas.data()), - thrust::raw_pointer_cast(lower_bound_beta.data()), - thrust::raw_pointer_cast(upper_bound_beta.data()), - thrust::raw_pointer_cast(found.data()), - thrust::raw_pointer_cast(neg_entropy.data()), - thrust::raw_pointer_cast(row_sum.data()), - perplexity_target, epsilon, num_points); - GpuErrorCheck(cudaDeviceSynchronize()); - - // Check if searching is done - all_found = thrust::reduce(found.begin(), found.end(), 1, thrust::minimum()); - iters++; - } while (!all_found && iters < 200); - // TODO: Warn if iters == 200 because perplexity not found? - - tsnecuda::util::BroadcastMatrixVector(pij, row_sum, num_points, num_points, thrust::divides(), 1, 1.0f); -} diff --git a/src/kernels/tree_builder.cu b/src/kernels/tree_builder.cu deleted file mode 100644 index 3e3c8df..0000000 --- a/src/kernels/tree_builder.cu +++ /dev/null @@ -1,189 +0,0 @@ -/* - Builds a quad tree for computing Barnes-Hut approximation of t-SNE repulsive forces. -*/ - -#include "include/kernels/tree_builder.h" - -/******************************************************************************/ -/*** build tree ***************************************************************/ -/******************************************************************************/ - -__global__ -void tsnecuda::bh::ClearKernel1(volatile int * __restrict__ children, const int num_nodes, const int num_points) -{ - register int k, inc, top, bottom; - - top = 4 * num_nodes; - bottom = 4 * num_points; - inc = blockDim.x * gridDim.x; - k = (bottom & (-32)) + threadIdx.x + blockIdx.x * blockDim.x; // align to warp size - if (k < bottom) k += inc; - - // iterate over all cells assigned to thread - while (k < top) { - children[k] = -1; - k += inc; - } -} - - -__global__ -void tsnecuda::bh::TreeBuildingKernel(volatile int * __restrict__ errd, - volatile int * __restrict__ children, - volatile float * __restrict__ x_pos_device, - volatile float * __restrict__ y_pos_device, - const int num_nodes, - const int num_points) -{ - register int i, j, depth, localmaxdepth, skip, inc; - register float x, y, r; - register float px, py; - register float dx, dy; - register int ch, n, cell, locked, patch; - register float radius, rootx, rooty; - - // cache root data - radius = radiusd; - rootx = x_pos_device[num_nodes]; - rooty = y_pos_device[num_nodes]; - - localmaxdepth = 1; - skip = 1; - inc = blockDim.x * gridDim.x; - i = threadIdx.x + blockIdx.x * blockDim.x; - - // iterate over all bodies assigned to thread - while (i < num_points) { - if (skip != 0) { - // new body, so start traversing at root - skip = 0; - px = x_pos_device[i]; - py = y_pos_device[i]; - n = num_nodes; - depth = 1; - r = radius * 0.5f; - dx = dy = -r; - j = 0; - // determine which child to follow - if (rootx < px) {j = 1; dx = r;} - if (rooty < py) {j |= 2; dy = r;} - x = rootx + dx; - y = rooty + dy; - } - - // follow path to leaf cell - ch = children[n*4+j]; - while (ch >= num_points) { - n = ch; - depth++; - r *= 0.5f; - dx = dy = -r; - j = 0; - // determine which child to follow - if (x < px) {j = 1; dx = r;} - if (y < py) {j |= 2; dy = r;} - x += dx; - y += dy; - ch = children[n*4+j]; - } - if (ch != -2) { // skip if child pointer is locked and try again later - locked = n*4+j; - if (ch == -1) { - if (-1 == atomicCAS((int *)&children[locked], -1, i)) { // if null, just insert the new body - localmaxdepth = max(depth, localmaxdepth); - i += inc; // move on to next body - skip = 1; - } - } else { // there already is a body in this position - if (ch == atomicCAS((int *)&children[locked], ch, -2)) { // try to lock - patch = -1; - // create new cell(s) and insert the old and new body - do { - depth++; - - cell = atomicSub((int *)&bottomd, 1) - 1; - if (cell <= num_points) { - *errd = 1; - bottomd = num_nodes; - } - - if (patch != -1) { - children[n*4+j] = cell; - } - patch = max(patch, cell); - j = 0; - if (x < x_pos_device[ch]) j = 1; - if (y < y_pos_device[ch]) j |= 2; - children[cell*4+j] = ch; - n = cell; - r *= 0.5f; - dx = dy = -r; - j = 0; - if (x < px) {j = 1; dx = r;} - if (y < py) {j |= 2; dy = r;} - x += dx; - y += dy; - ch = children[n*4+j]; - // repeat until the two bodies are different children - } while (ch >= 0 && r > 1e-10); // add radius check because bodies that are very close together can cause this to fail... if points are too close together it will exceed the max depth of the tree - children[n*4+j] = i; - - localmaxdepth = max(depth, localmaxdepth); - i += inc; // move on to next body - skip = 2; - } - } - } - __threadfence(); - - if (skip == 2) { - children[locked] = patch; - } - } - // record maximum tree depth - atomicMax((int *)&maxdepthd, localmaxdepth); -} - - -__global__ -void tsnecuda::bh::ClearKernel2(volatile int * __restrict__ cell_starts, volatile float * __restrict__ cell_mass, const int num_nodes) -{ - register int k, inc, bottom; - - bottom = bottomd; - inc = blockDim.x * gridDim.x; - k = (bottom & (-32)) + threadIdx.x + blockIdx.x * blockDim.x; // align to warp size - if (k < bottom) k += inc; - - // iterate over all cells assigned to thread - while (k < num_nodes) { - cell_mass[k] = -1.0f; - cell_starts[k] = -1; - k += inc; - } -} - -void tsnecuda::bh::BuildTree(tsnecuda::GpuOptions &gpu_opt, - thrust::device_vector &errd, - thrust::device_vector &children, - thrust::device_vector &cell_starts, - thrust::device_vector &cell_mass, - thrust::device_vector &points, - const int num_nodes, - const int num_points, - const int num_blocks) -{ - tsnecuda::bh::ClearKernel1<<>>(thrust::raw_pointer_cast(children.data()), - num_nodes, num_points); - tsnecuda::bh::TreeBuildingKernel<<>>( - thrust::raw_pointer_cast(errd.data()), - thrust::raw_pointer_cast(children.data()), - thrust::raw_pointer_cast(points.data()), - thrust::raw_pointer_cast(points.data() + num_nodes + 1), - num_nodes, num_points); - tsnecuda::bh::ClearKernel2<<>>(thrust::raw_pointer_cast(cell_starts.data()), - thrust::raw_pointer_cast(cell_mass.data()), - num_nodes); - GpuErrorCheck(cudaDeviceSynchronize()); -} diff --git a/src/kernels/tree_sort.cu b/src/kernels/tree_sort.cu deleted file mode 100644 index 67235c8..0000000 --- a/src/kernels/tree_sort.cu +++ /dev/null @@ -1,72 +0,0 @@ -/* - Sort points and cells by morton code. -*/ - -#include "include/kernels/tree_sort.h" - -/******************************************************************************/ -/*** sort bodies **************************************************************/ -/******************************************************************************/ - -__global__ -void tsnecuda::bh::SortKernel(int * __restrict__ cell_sorted, - volatile int * __restrict__ cell_starts, - int * __restrict__ children, - const int * __restrict__ cell_counts, - const int num_nodes, - const int num_points) -{ - register int i, j, k, ch, dec, start, bottom; - - bottom = bottomd; - dec = blockDim.x * gridDim.x; - k = num_nodes + 1 - dec + threadIdx.x + blockIdx.x * blockDim.x; - - // iterate over all cells assigned to thread - while (k >= bottom) { - start = cell_starts[k]; - if (start >= 0) { - j = 0; - for (i = 0; i < 4; i++) { - ch = children[k*4+i]; - if (ch >= 0) { - if (i != j) { - // move children to front (needed later for speed) - children[k*4+i] = -1; - children[k*4+j] = ch; - } - j++; - if (ch >= num_points) { - // child is a cell - cell_starts[ch] = start; // set start ID of child - start += cell_counts[ch]; // add #bodies in subtree - } else { - // child is a body - cell_sorted[start] = ch; // record body in 'sorted' array - start++; - } - } - } - k -= dec; // move on to next cell - } - } -} - -void tsnecuda::bh::SortCells(tsnecuda::GpuOptions &gpu_opt, - thrust::device_vector &cell_sorted, - thrust::device_vector &cell_starts, - thrust::device_vector &children, - thrust::device_vector &cell_counts, - const int num_nodes, - const int num_points, - const int num_blocks) -{ - tsnecuda::bh::SortKernel<<>>( - thrust::raw_pointer_cast(cell_sorted.data()), - thrust::raw_pointer_cast(cell_starts.data()), - thrust::raw_pointer_cast(children.data()), - thrust::raw_pointer_cast(cell_counts.data()), - num_nodes, num_points); - GpuErrorCheck(cudaDeviceSynchronize()); -} diff --git a/src/kernels/tree_summary.cu b/src/kernels/tree_summary.cu deleted file mode 100644 index 0f646d1..0000000 --- a/src/kernels/tree_summary.cu +++ /dev/null @@ -1,166 +0,0 @@ -/* - Summarize position and mass of cells in quad-tree. -*/ - -#include "include/kernels/tree_summary.h" - -/******************************************************************************/ -/*** compute center of mass ***************************************************/ -/******************************************************************************/ - -__global__ -void tsnecuda::bh::SummarizationKernel( - volatile int * __restrict cell_counts, - volatile float * __restrict cell_mass, - volatile float * __restrict x_pos_device, - volatile float * __restrict y_pos_device, - const int * __restrict children, - const int num_nodes, - const int num_points, - const int gpu_summary_threads) -{ - register int i, j, k, ch, inc, cnt, bottom, flag; - register float m, cm, px, py; - extern __shared__ int summary_shared_memory[]; - int* child = (int*) summary_shared_memory; - float* mass = (float*) (child + 4*gpu_summary_threads); - - bottom = bottomd; - inc = blockDim.x * gridDim.x; - k = (bottom & (-32)) + threadIdx.x + blockIdx.x * blockDim.x; // align to warp size - if (k < bottom) k += inc; - - register int restart = k; - for (j = 0; j < 5; j++) { // wait-free pre-passes - // iterate over all cells assigned to thread - while (k <= num_nodes) { - if (cell_mass[k] < 0.0f) { - for (i = 0; i < 4; i++) { - ch = children[k*4+i]; - child[i*gpu_summary_threads+threadIdx.x] = ch; // cache children - if ((ch >= num_points) && ((mass[i*gpu_summary_threads+threadIdx.x] = cell_mass[ch]) < 0.0f)) { - break; - } - } - if (i == 4) { - // all children are ready - cm = 0.0f; - px = 0.0f; - py = 0.0f; - cnt = 0; - for (i = 0; i < 4; i++) { - ch = child[i*gpu_summary_threads+threadIdx.x]; - if (ch >= 0) { - if (ch >= num_points) { // count bodies (needed later) - m = mass[i*gpu_summary_threads+threadIdx.x]; - cnt += cell_counts[ch]; - } else { - m = cell_mass[ch]; - cnt++; - } - // add child's contribution - cm += m; - px += x_pos_device[ch] * m; - py += y_pos_device[ch] * m; - } - } - cell_counts[k] = cnt; - m = 1.0f / cm; - x_pos_device[k] = px * m; - y_pos_device[k] = py * m; - __threadfence(); // make sure data are visible before setting mass - cell_mass[k] = cm; - } - } - k += inc; // move on to next cell - } - k = restart; - } - - flag = 0; - j = 0; - // iterate over all cells assigned to thread - while (k <= num_nodes) { - if (cell_mass[k] >= 0.0f) { - k += inc; - } else { - if (j == 0) { - j = 4; - for (i = 0; i < 4; i++) { - ch = children[k*4+i]; - child[i*gpu_summary_threads+threadIdx.x] = ch; // cache children - if ((ch < num_points) || ((mass[i*gpu_summary_threads+threadIdx.x] = cell_mass[ch]) >= 0.0f)) { - j--; - } - } - } else { - j = 4; - for (i = 0; i < 4; i++) { - ch = child[i*gpu_summary_threads+threadIdx.x]; - if ((ch < num_points) || (mass[i*gpu_summary_threads+threadIdx.x] >= 0.0f) || ((mass[i*gpu_summary_threads+threadIdx.x] = cell_mass[ch]) >= 0.0f)) { - j--; - } - } - } - - if (j == 0) { - // all children are ready - cm = 0.0f; - px = 0.0f; - py = 0.0f; - cnt = 0; - for (i = 0; i < 4; i++) { - ch = child[i*gpu_summary_threads+threadIdx.x]; - if (ch >= 0) { - if (ch >= num_points) { // count bodies (needed later) - m = mass[i*gpu_summary_threads+threadIdx.x]; - cnt += cell_counts[ch]; - } else { - m = cell_mass[ch]; - cnt++; - } - // add child's contribution - cm += m; - px += x_pos_device[ch] * m; - py += y_pos_device[ch] * m; - } - } - cell_counts[k] = cnt; - m = 1.0f / cm; - x_pos_device[k] = px * m; - y_pos_device[k] = py * m; - flag = 1; - } - } - __syncthreads(); - __threadfence(); - if (flag != 0) { - cell_mass[k] = cm; - k += inc; - flag = 0; - } - } -} - -void tsnecuda::bh::SummarizeTree(tsnecuda::GpuOptions &gpu_opt, - thrust::device_vector &cell_counts, - thrust::device_vector &children, - thrust::device_vector &cell_mass, - thrust::device_vector &pts_device, - const int num_nodes, - const int num_points, - const int num_blocks) -{ - tsnecuda::bh::SummarizationKernel<<>>( - thrust::raw_pointer_cast(cell_counts.data()), - thrust::raw_pointer_cast(cell_mass.data()), - thrust::raw_pointer_cast(pts_device.data()), - thrust::raw_pointer_cast(pts_device.data() + num_nodes + 1), - thrust::raw_pointer_cast(children.data()), - num_nodes, num_points, - gpu_opt.summary_kernel_threads); - GpuErrorCheck(cudaDeviceSynchronize()); -} diff --git a/src/naive_tsne.cu b/src/naive_tsne.cu deleted file mode 100644 index 9785b6d..0000000 --- a/src/naive_tsne.cu +++ /dev/null @@ -1,416 +0,0 @@ -/** - * @brief Implementation of naive T-SNE - * - * @file naive_tsne.cu - * @author David Chan - * @date 2018-04-04 - */ - -#include "naive_tsne.h" - -__global__ void upper_lower_assign(float * __restrict__ sigmas, - float * __restrict__ lower_bound, - float * __restrict__ upper_bound, - const float * __restrict__ perplexity, - const float target_perplexity, - const unsigned int N) -{ - int TID = threadIdx.x + blockIdx.x * blockDim.x; - if (TID > N) return; - - if (perplexity[TID] > target_perplexity) - upper_bound[TID] = sigmas[TID]; - else - lower_bound[TID] = sigmas[TID]; - sigmas[TID] = (upper_bound[TID] + lower_bound[TID])/2.0f; -} - -// TODO: replace this with the bhtsne version -void NaiveTSNE::thrust_search_perplexity(cublasHandle_t &handle, - thrust::device_vector &sigmas, - thrust::device_vector &lower_bound, - thrust::device_vector &upper_bound, - thrust::device_vector &perplexity, - const thrust::device_vector &pij, - const float target_perplexity, - const unsigned int N) -{ - // std::cout << "pij:" << std::endl; - // printarray(pij, N, N); - // std::cout << std::endl; - thrust::device_vector entropy_(pij.size()); - thrust::transform(pij.begin(), pij.end(), entropy_.begin(), tsnecuda::util::FunctionalEntropy()); - tsnecuda::util::ZeroDeviceMatrixDiagonal(entropy_, N); - - // std::cout << "entropy:" << std::endl; - // printarray(entropy_, N, N); - // std::cout << std::endl; - - auto neg_entropy = tsnecuda::util::ReduceAlpha(handle, entropy_, N, N, -1.0f, 1); - - // std::cout << "neg_entropy:" << std::endl; - // printarray(neg_entropy, 1, N); - // std::cout << std::endl; - thrust::transform(neg_entropy.begin(), neg_entropy.end(), perplexity.begin(), tsnecuda::util::FunctionalPower2()); - // std::cout << "perplexity:" << std::endl; - // printarray(perplexity, 1, N); - // std::cout << std::endl; - - const unsigned int BLOCKSIZE = 32; - const unsigned int NBLOCKS = iDivUp(N, BLOCKSIZE); - upper_lower_assign<<>>(thrust::raw_pointer_cast(sigmas.data()), - thrust::raw_pointer_cast(lower_bound.data()), - thrust::raw_pointer_cast(upper_bound.data()), - thrust::raw_pointer_cast(perplexity.data()), - target_perplexity, - N); - // std::cout << "sigmas" << std::endl; - // printarray(sigmas, 1, N); - // std::cout << std::endl; - -} - -thrust::device_vector NaiveTSNE::search_perplexity(cublasHandle_t &handle, - thrust::device_vector &points, - const float perplexity_target, - const float eps, - const unsigned int N, - const unsigned int NDIMS) { - - thrust::device_vector sigmas(N, 500.0f); - thrust::device_vector best_sigmas(N); - thrust::device_vector perplexity(N); - thrust::device_vector lbs(N, 0.0f); - thrust::device_vector ubs(N, 1000.0f); - - thrust::device_vector pij(N*N); - NaiveTSNE::compute_pij(handle, pij, points, sigmas, N, NDIMS); - float best_perplexity = 1000.0f; - float perplexity_diff = 50.0f; - int iters = 0; - while (perplexity_diff > eps) { - NaiveTSNE::thrust_search_perplexity(handle, sigmas, lbs, ubs, perplexity, pij, perplexity_target, N); - perplexity_diff = abs(thrust::reduce(perplexity.begin(), perplexity.end())/((float) N) - perplexity_target); - if (perplexity_diff < best_perplexity){ - best_perplexity = perplexity_diff; - // printf("!! Best perplexity found in %d iterations: %0.5f\n", iters, perplexity_diff); - thrust::copy(sigmas.begin(), sigmas.end(), best_sigmas.begin()); - } - NaiveTSNE::compute_pij(handle, pij, points, sigmas, N, NDIMS); - iters++; - } // Close perplexity search - - return best_sigmas; -} - -// TODO: put this in the same file as bhtsne compute_pij -void NaiveTSNE::compute_pij( - cublasHandle_t &handle, - thrust::device_vector &pij, - const thrust::device_vector &points, - const thrust::device_vector &sigma, - const unsigned int N, - const unsigned int NDIMS) -{ - tsnecuda::util::SquaredPairwiseDistance(handle, pij, points, N, NDIMS); - - // std::cout << "Sigma:" << std::endl; - // printarray(sigma, N, 1); - thrust::device_vector sigma_squared(sigma.size()); - tsnecuda::util::SquareDeviceVector(sigma_squared, sigma); - - // divide column by sigmas (matrix[i,:] gets divided by sigma_i^2) - tsnecuda::util::BroadcastMatrixVector(pij, sigma_squared, N, N, thrust::divides(), 0, -2.0f); - thrust::transform(pij.begin(), pij.end(), pij.begin(), tsnecuda::util::FunctionalExp()); - tsnecuda::util::ZeroDeviceMatrixDiagonal(pij, N); - - // tsnecuda::util::ReduceSum over cols? rows? Fuck if I know. - auto sums = tsnecuda::util::ReduceSum(handle, pij, N, N, 1); - - // divide column by resulting vector - tsnecuda::util::BroadcastMatrixVector(pij, sums, N, N, thrust::divides(), 0, 1.0f); -} - -void NaiveTSNE::symmetrize_pij(cublasHandle_t &handle, - thrust::device_vector &pij, - const unsigned int N) -{ - float alpha = 0.5f; - float beta = 0.5f; - thrust::device_vector pij_out(N*N); - CublasSafeCall(cublasSgeam(handle, CUBLAS_OP_N, CUBLAS_OP_T, N, N, &alpha, thrust::raw_pointer_cast(pij.data()), N, - &beta, thrust::raw_pointer_cast(pij.data()), N, thrust::raw_pointer_cast(pij_out.data()), N)); - pij = pij_out; -} - - -/** - * Gradient formula from http://www.jmlr.org/papers/volume9/vandermaaten08a/vandermaaten08a.pdf - * - * Given by -> - * forces_i = 4 * \sum_j (pij - qij)(yi - yj)(1 + ||y_i - y_j||^2)^-1 - * - * Notation below - in comments, actual variables in the code are referred to by _ to differentiate from the mathematical quantities - * It's hard to name variables correctly because we don't want to keep allocating more memory. There's probably a better solution than this though. - */ -float NaiveTSNE::compute_gradients(cublasHandle_t &handle, - thrust::device_vector &forces, - thrust::device_vector &dist, - thrust::device_vector &ys, - thrust::device_vector &pij, - thrust::device_vector &qij, - const unsigned int N, - const unsigned int PROJDIM, - float eta) -{ - // dist_ = ||y_i - y_j||^2 - tsnecuda::util::SquaredPairwiseDistance(handle, dist, ys, N, PROJDIM); - - // std::cout << std::endl << std::endl << "Dist" << std::endl; - // printarray(dist, N, N); - - // dist_ = (1 + ||y_i - y_j||^2)^-1 - thrust::transform(dist.begin(), dist.end(), dist.begin(), tsnecuda::util::FunctionalIncrementInverse()); - tsnecuda::util::ZeroDeviceMatrixDiagonal(dist, N); - - // std::cout << std::endl << std::endl << "Inc-Inv Dist" << std::endl; - // printarray(dist, N, N); - - // qij_ = (1 + ||y_i - y_j||^2)^-1 / \Sum_{k != i} (1 + ||y_i - y_k||^2)^-1 - float sum = thrust::reduce(dist.begin(), dist.end(), 0.0f, thrust::plus()); - thrust::transform(dist.begin(), dist.end(), thrust::make_constant_iterator(sum), qij.begin(), thrust::divides()); - - // auto sums = tsnecuda::util::ReduceSum(handle, qij, N, N, 1); - - // std::cout << std::endl << std::endl << "Sum-Dist" << std::endl; - // printarray(sums, 1, N); - - // tsnecuda::util::BroadcastMatrixVector(qij, sums, N, N, thrust::divides(), 0, 1.0f); - - // std::cout << std::endl << std::endl << "Qij" << std::endl; - // printarray(qij, N, N); - - // Compute loss = \sum_ij pij * log(pij / qij) - thrust::device_vector loss_(N * N); - thrust::transform(pij.begin(), pij.end(), qij.begin(), loss_.begin(), tsnecuda::util::FunctionalKlDivergence()); - tsnecuda::util::ZeroDeviceMatrixDiagonal(loss_, N); - - // printarray(loss_, N, N); - float loss = thrust::reduce(loss_.begin(), loss_.end(), 0.0f, thrust::plus()); - - // qij_ = pij - qij - thrust::transform(pij.begin(), pij.end(), qij.begin(), qij.begin(), thrust::minus()); - - // std::cout << std::endl << std::endl << "Pij-Qij" << std::endl; - // printarray(qij, N, N); - - // qij_ = (pij - qij)(1 + ||y_i - y_j||^2)^-1 - thrust::transform(qij.begin(), qij.end(), dist.begin(), qij.begin(), thrust::multiplies()); - - // std::cout << std::endl << std::endl << "A" << std::endl; - // printarray(qij, N, N); - - // forces_ = \sum_j (pij - qij)(1 + ||y_i - y_j||^2)^-1 - float alpha = 1.0f; - float beta = 0.0f; - thrust::device_vector ones(PROJDIM * N, 1.0f); - CublasSafeCall(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, PROJDIM, N, &alpha, - thrust::raw_pointer_cast(qij.data()), N, thrust::raw_pointer_cast(ones.data()), N, &beta, - thrust::raw_pointer_cast(forces.data()), N)); - - // std::cout << std::endl << std::endl << "A * 1" << std::endl; - // printarray(forces, 2, N); - - // forces_ = y_i * \sum_j (pij - qij)(1 + ||y_i - y_j||^2)^-1 - thrust::transform(forces.begin(), forces.end(), ys.begin(), forces.begin(), thrust::multiplies()); - alpha = eta; - beta = -eta; - // forces_ = 4 * y_i * \sum_j (pij - qij)(1 + ||y_i - y_j||^2)^-1 - 4 * \sum_j y_j(pij - qij)(1 + ||y_i - y_j||^2)^-1 - CublasSafeCall(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, PROJDIM, N, &alpha, - thrust::raw_pointer_cast(qij.data()), N, thrust::raw_pointer_cast(ys.data()), N, &beta, - thrust::raw_pointer_cast(forces.data()), N)); - - // std::cout << std::endl << std::endl << "Final Forces" << std::endl; - // printarray(forces, 2, N); - - return loss; -} - -thrust::device_vector NaiveTSNE::tsne(cublasHandle_t &handle, - thrust::device_vector &points, - const unsigned int N, - const unsigned int NDIMS, - const unsigned int PROJDIM) { - tsnecuda::util::MaxNormalizeDeviceVector(points); - std::default_random_engine generator(time(NULL)); - - // Choose the right sigmas - std::cout << "Selecting sigmas to match perplexity..." << std::endl; - - //TODO: Fix perplexity search - - float perplexity_target = 8.0f; - float eps = 1e-2; - - thrust::device_vector sigmas = NaiveTSNE::search_perplexity(handle, points, perplexity_target, eps, N, NDIMS); - thrust::device_vector pij(N*N); - NaiveTSNE::compute_pij(handle, pij, points, sigmas, N, NDIMS); - NaiveTSNE::symmetrize_pij(handle, pij, N); - - //std::cout << "Pij" << std::endl; - //printarray(pij, N, N); - - thrust::device_vector forces(N * PROJDIM); - thrust::device_vector ys = tsnecuda::util::RandomDeviceUniformZeroOneVector(generator, N * PROJDIM); - - // Momentum variables - thrust::device_vector yt_1(N * PROJDIM); - thrust::device_vector momentum(N * PROJDIM); - float momentum_weight = 0.8f; - - - //printarray(ys, N, 2); - thrust::device_vector qij(N * N); - thrust::device_vector dist(N * N); - float eta = 1.00f; - float loss = 0.0f; - - - #ifdef DEBUG - // Dump the original points - std::ofstream dump_points_file; - dump_points_file.open ("dump_points.txt"); - dump_points_file << N << " " << NDIMS << std::endl; - for (int i = 0; i < N; i++) { - for (int j = 0; j < NDIMS; j++) { - dump_points_file << points[i + j*N] << " "; - } - dump_points_file << std::endl; - } - dump_points_file.close(); - #endif - - - #ifdef DEBUG - // Create a dump file for the points - std::ofstream dump_file; - dump_file.open("dump_ys.txt"); - float host_ys[N * PROJDIM]; - dump_file << N << " " << PROJDIM << std::endl; - #endif - - for (int i = 0; i < 1000; i++) { - loss = NaiveTSNE::compute_gradients(handle, forces, dist, ys, pij, qij, N, PROJDIM, eta); - - // Compute the momentum - thrust::transform(ys.begin(), ys.end(), yt_1.begin(), momentum.begin(), thrust::minus()); - thrust::transform(momentum.begin(), momentum.end(), thrust::make_constant_iterator(momentum_weight), momentum.begin(), thrust::multiplies() ); - thrust::copy(ys.begin(), ys.end(), yt_1.begin()); - - // Apply the forces - thrust::transform(ys.begin(), ys.end(), forces.begin(), ys.begin(), thrust::plus()); - thrust::transform(ys.begin(), ys.end(), momentum.begin(), ys.begin(), thrust::plus()); - - //TODO: Add early termination for loss deltas - - if (i % 100 == 0) - std::cout << "Iteration: " << i << ", Loss: " << loss << ", ForceMag: " << tsnecuda::util::L2NormDeviceVector(forces) << std::endl; - - #ifdef DEBUG - // Dump the points - thrust::copy(ys.begin(), ys.end(), host_ys); - for (int i = 0; i < N; i++) { - for (int j = 0; j < PROJDIM; j++) { - dump_file << host_ys[i + j*N] << " "; - } - dump_file << std::endl; - } - #endif - } - #ifdef DEBUG - dump_file.close(); - #endif - return ys; -} - -thrust::device_vector NaiveTSNE::tsne(cublasHandle_t &handle, - thrust::device_vector &d_points, - unsigned int N_POINTS, - unsigned int N_DIMS, - unsigned int PROJDIM, - float perplexity, - float early_ex, - float learning_rate, - unsigned int n_iter, - unsigned int n_iter_np, - float min_g_norm){ - - // Compute a max-norm on the points - tsnecuda::util::MaxNormalizeDeviceVector(d_points); - - // Setup a random generator - std::default_random_engine generator(time(NULL)); - - // Choose the right sigmas - std::cout << "Selecting sigmas to match perplexity..." << std::endl; - float eps = 1e-2; - thrust::device_vector sigmas = NaiveTSNE::search_perplexity(handle, d_points, perplexity, eps, N_POINTS, N_DIMS); - thrust::device_vector pij(N_POINTS*N_POINTS); - NaiveTSNE::compute_pij(handle, pij, d_points, sigmas, N_POINTS, N_DIMS); - NaiveTSNE::symmetrize_pij(handle, pij, N_POINTS); - - // Allocate some memory for the foces and such - thrust::device_vector forces(N_POINTS * PROJDIM); - thrust::device_vector ys = tsnecuda::util::RandomDeviceUniformZeroOneVector(generator, N_POINTS * PROJDIM); - - // Momentum variables - thrust::device_vector yt_1(N_POINTS * PROJDIM); - thrust::device_vector momentum(N_POINTS * PROJDIM); - - - // Qij and distance vector allocations - thrust::device_vector qij(N_POINTS * N_POINTS); - thrust::device_vector dist(N_POINTS * N_POINTS); - - // Setup the learning rate - float eta = learning_rate*early_ex; - float momentum_weight = 0.5f; - float loss = 0.0f; - bool using_early = true; - - float best_error = 0.0; - int best_iter = 0; - - for (int i = 0; i < n_iter; i++) { - - // Check for and turn off early exaggeration - if (using_early) { if (i > 250) { - using_early = false; - eta /= early_ex; - momentum_weight = 0.8; - }} - - // Compute the loss/gradients - loss = NaiveTSNE::compute_gradients(handle, forces, dist, ys, pij, qij, N_POINTS, PROJDIM, eta); - - // Compute the momentum - thrust::transform(ys.begin(), ys.end(), yt_1.begin(), momentum.begin(), thrust::minus()); - thrust::transform(momentum.begin(), momentum.end(), thrust::make_constant_iterator(momentum_weight), momentum.begin(), thrust::multiplies() ); - thrust::copy(ys.begin(), ys.end(), yt_1.begin()); - - // Apply the forces - thrust::transform(ys.begin(), ys.end(), forces.begin(), ys.begin(), thrust::plus()); - thrust::transform(ys.begin(), ys.end(), momentum.begin(), ys.begin(), thrust::plus()); - - // Terminate if we're not changing loss much - if (loss < best_error || i == 0) { - best_error = loss; - best_iter = i; - } else {if (i - best_iter > n_iter_np) break;} - - // Terminate if we're less than the minimum gradient norm - if (tsnecuda::util::L2NormDeviceVector(forces) < min_g_norm) break; - } - return ys; -} - diff --git a/src/naive_tsne_cpu.cu b/src/naive_tsne_cpu.cu deleted file mode 100644 index ba57ea0..0000000 --- a/src/naive_tsne_cpu.cu +++ /dev/null @@ -1,183 +0,0 @@ -#include "common.h" - -std::vector squared_pairwise_dist(std::vector &points, const unsigned int N, const unsigned int NDIMS) { - std::vector squared_pairwise_dist(N * N, 0); - for (int i = 0; i < N; i++) { - for (int j = 0; j < N; j++) { - for(int k = 0; k < NDIMS; k++) { - squared_pairwise_dist[i * N + j] += (points[i * NDIMS + k] - points[j * NDIMS + k]) * (points[i * NDIMS + k] - points[j * NDIMS + k]); - } - } - } - return squared_pairwise_dist; -} - - - -bool perplexity_equal(const float delta, float perplexity, float target_perplexity) { - return (perplexity >= target_perplexity - delta) && (perplexity <= target_perplexity + delta); -} - - -float get_perplexity(std::vector & pij, const unsigned int i, unsigned int N) { - float entropy = 0.0f; - for (int j = 0; j < N; j++) { - if (j != i) { - entropy += pij[i*N + j] * std::log2(pij[i*N + j]); - } - } - return std::pow(2, -entropy); -} - -bool compare_perplexity(std::vector& pij, - float& lo, - float& mid, - float& hi, - const unsigned int i, - const unsigned int N, - const float delta, - const float target_perplexity) { - float perplexity = get_perplexity(pij, i, N); - if (perplexity_equal(delta, perplexity, target_perplexity)) { - return true; - } else if (perplexity > target_perplexity) { - hi = mid - delta; - } else { - lo = mid + delta; - } - - mid = (lo + hi)/2; - return false; -} - - -void recompute_pij_row_cpu(std::vector &points, - std::vector &pij, - float sigma, - float i, - const unsigned int N, - const unsigned int NDIMS) { - std::vector dists = squared_pairwise_dist(points, N, NDIMS); - for (int j = 0; j < N; j++) { - float denom = 0; - for (int k = 0; k < N; k++) { - if (k != i) { - denom += std::exp(-(dists[i * N + k] / (2 * sigma * sigma))); - } - } - if (i != j) { - pij[i * N + j] = std::exp(-dists[i * N + j] / (2 * sigma * sigma)) / denom; - } - - } -} - -std::vector compute_pij_cpu(std::vector &points, - std::vector &sigma, - const unsigned int N, - const unsigned int NDIMS) { - std::vector pij_out(N * N, 0.0f); - std::vector dists = squared_pairwise_dist(points, N, NDIMS); - - for (int i = 0; i < N; i++) { - for (int j = 0; j < N; j++) { - float denom = 0; - for (int k = 0; k < N; k++) { - if (k != i) { - denom += std::exp(-(dists[i * N + k] / (2 * sigma[i] * sigma[i]))); - } - } - if (i != j) { - pij_out[i * N + j] = std::exp(-dists[i * N + j] / (2 * sigma[i] * sigma[i])) / denom; - } - - } - } - return pij_out; - -} - -std::vector compute_qij_cpu(std::vector& ys, const unsigned int N, const unsigned int PROJDIMS) { - - std::vector qij_out(N * N); - std::vector dists = squared_pairwise_dist(ys, N, PROJDIMS); - - for (int i = 0; i < N; i++) { - for (int j = 0; j < N; j++) { - float denom = 0; - for (int k = 0; k < N; k++) { - if (k != i) { - denom += 1 / (1 + (dists[i * N + k])); - } - } - qij_out[i * N + j] = (1 / (1 + dists[i * N + j])) / denom; - } - } - return qij_out; -} - -float kl_div(float pij, float qij) { - return pij * std::log(pij / qij); -} - -float compute_gradients_cpu(std::vector &forces, - std::vector &dist, - std::vector &ys, - std::vector &pij, - std::vector &qij, - const unsigned int N, - float eta) { - - float loss = 0.0f; - for (int i = 0; i < N; i++) { - for (int j = 0; j < N; j++) { - float pij_sym = (pij[i * N + j] + pij[j * N + i]) / (2 * N); - float qij_sym = (qij[i * N + j] + qij[j * N + i]) / (2 * N); - loss += kl_div(pij_sym, qij_sym); - } - - } - return loss; - -} - - -std::vector sigmas_search_cpu(std::vector &points, - const unsigned int N, - const unsigned int NDIMS, - float target_perplexity) { - const float max_sigma = 1000.0f; - const float delta = 0.1f; - std::vector sigmas(N, max_sigma/2); - std::vector pij = compute_pij_cpu(points, sigmas, N, NDIMS); - for (int i = 0; i < N; i++) { - bool found = false; - float lo = 0.0f; - float hi = max_sigma; - float mid = (lo + hi)/ 2; - while (!found) { - found = compare_perplexity(pij, lo, mid, hi, i, N, delta, target_perplexity); - recompute_pij_row_cpu(points, pij, mid, i, N, NDIMS); - } - sigmas[i] = mid; - } - return sigmas; - -} - -std::vector naive_tsne_cpu(std::vector &points, - const unsigned int N, - const unsigned int NDIMS) { - std::default_random_engine generator; - std::uniform_real_distribution distribution(-10.0f,10.0f); - const unsigned int NPROJDIM = 2; - std::vector ys(N * NPROJDIM); - for (int i = 0; i < N * NPROJDIM; i++) { - ys[i] = distribution(generator); - } - for (int i = 0; i < 1000; i++) { - - } - return ys; - -} diff --git a/src/nbodyfft.cu b/src/nbodyfft.cu index 586e4a1..177116e 100644 --- a/src/nbodyfft.cu +++ b/src/nbodyfft.cu @@ -1,7 +1,4 @@ -#include "nbodyfft.h" -#include -#include "include/util/cuda_utils.h" -#include "include/util/matrix_broadcast_utils.h" +#include "include/nbodyfft.h" #define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__) @@ -57,7 +54,7 @@ inline void __cufftSafeCall(cufftResult err, const char *file, const int line) } } -__global__ void copy_to_fft_input(volatile float * __restrict__ fft_input, +__global__ void copy_to_fft_input(volatile float * __restrict__ fft_input, const float * w_coefficients_device, const int n_fft_coeffs, const int n_fft_coeffs_half, @@ -70,14 +67,14 @@ __global__ void copy_to_fft_input(volatile float * __restrict__ fft_input, register int current_term = TID / (n_fft_coeffs_half * n_fft_coeffs_half); register int current_loc = TID % (n_fft_coeffs_half * n_fft_coeffs_half); - + i = current_loc / n_fft_coeffs_half; j = current_loc % n_fft_coeffs_half; fft_input[current_term * (n_fft_coeffs * n_fft_coeffs) + i * n_fft_coeffs + j] = w_coefficients_device[current_term + current_loc * n_terms]; } -__global__ void copy_from_fft_output(volatile float * __restrict__ y_tilde_values, +__global__ void copy_from_fft_output(volatile float * __restrict__ y_tilde_values, const float * fft_output, const int n_fft_coeffs, const int n_fft_coeffs_half, @@ -107,12 +104,12 @@ __global__ void compute_point_box_idx(volatile int * __restrict__ point_box_idx, const float box_width, const int n_boxes, const int n_total_boxes, - const int N) + const int N) { register int TID = threadIdx.x + blockIdx.x * blockDim.x; if (TID >= N) return; - + register int x_idx = (int) ((xs[TID] - coord_min) / box_width); register int y_idx = (int) ((ys[TID] - coord_min) / box_width); @@ -146,7 +143,7 @@ __global__ void interpolate_device( i = TID % N; j = TID / N; - + value = 1; ybox_i = y_in_box[i]; @@ -174,7 +171,7 @@ __global__ void compute_interpolated_indices( TID = threadIdx.x + blockIdx.x * blockDim.x; if (TID >= n_terms * n_interpolation_points * n_interpolation_points * N) return; - + current_term = TID % n_terms; i = (TID / n_terms) % N; interp_j = ((TID / n_terms) / N) % n_interpolation_points; @@ -189,7 +186,7 @@ __global__ void compute_interpolated_indices( (box_j * n_interpolation_points) + interp_j; // interpolated_indices[TID] = idx * n_terms + current_term; atomicAdd( - w_coefficients_device + idx * n_terms + current_term, + w_coefficients_device + idx * n_terms + current_term, x_interpolated_values[i + interp_i * N] * y_interpolated_values[i + interp_j * N] * chargesQij[i * n_terms + current_term]); } @@ -208,7 +205,7 @@ __global__ void compute_potential_indices( TID = threadIdx.x + blockIdx.x * blockDim.x; if (TID >= n_terms * n_interpolation_points * n_interpolation_points * N) return; - + current_term = TID % n_terms; i = (TID / n_terms) % N; interp_j = ((TID / n_terms) / N) % n_interpolation_points; @@ -223,7 +220,7 @@ __global__ void compute_potential_indices( // interpolated_values[TID] = x_interpolated_values[i + interp_i * N] * y_interpolated_values[i + interp_j * N] * y_tilde_values[idx * n_terms + current_term]; // interpolated_indices[TID] = i * n_terms + current_term; atomicAdd( - potentialsQij + i * n_terms + current_term, + potentialsQij + i * n_terms + current_term, x_interpolated_values[i + interp_i * N] * y_interpolated_values[i + interp_j * N] * y_tilde_values[idx * n_terms + current_term]); } @@ -244,7 +241,7 @@ __global__ void compute_kernel_tilde( TID = threadIdx.x + blockIdx.x * blockDim.x; if (TID >= n_interpolation_points_1d * n_interpolation_points_1d) return; - + i = TID / n_interpolation_points_1d; j = TID % n_interpolation_points_1d; @@ -253,8 +250,8 @@ __global__ void compute_kernel_tilde( kernel_tilde[(n_interpolation_points_1d - i) * n_fft_coeffs + (n_interpolation_points_1d + j)] = tmp; kernel_tilde[(n_interpolation_points_1d + i) * n_fft_coeffs + (n_interpolation_points_1d - j)] = tmp; kernel_tilde[(n_interpolation_points_1d - i) * n_fft_coeffs + (n_interpolation_points_1d - j)] = tmp; - -} + +} __global__ void compute_upper_and_lower_bounds( volatile float * __restrict__ box_upper_bounds, @@ -293,9 +290,18 @@ __global__ void copy_to_w_coefficients( w_coefficients_device[output_indices[TID]] = output_values[TID]; } -void precompute_2d(cufftHandle &plan_kernel_tilde, float x_max, float x_min, float y_max, float y_min, int n_boxes, int n_interpolation_points, - thrust::device_vector &box_lower_bounds_device, thrust::device_vector &box_upper_bounds_device, - thrust::device_vector &kernel_tilde_device, thrust::device_vector> &fft_kernel_tilde_device) { +void tsnecuda::PrecomputeFFT2D( + cufftHandle &plan_kernel_tilde, + float x_max, + float x_min, + float y_max, + float y_min, + int n_boxes, + int n_interpolation_points, + thrust::device_vector &box_lower_bounds_device, + thrust::device_vector &box_upper_bounds_device, + thrust::device_vector &kernel_tilde_device, + thrust::device_vector > &fft_kernel_tilde_device) { const int num_threads = 32; int num_blocks = (n_boxes * n_boxes + num_threads - 1) / num_threads; /* @@ -303,7 +309,7 @@ void precompute_2d(cufftHandle &plan_kernel_tilde, float x_max, float x_min, flo */ int n_total_boxes = n_boxes * n_boxes; float box_width = (x_max - x_min) / (float) n_boxes; - + // Left and right bounds of each box, first the lower bounds in the x direction, then in the y direction compute_upper_and_lower_bounds<<>>( thrust::raw_pointer_cast(box_upper_bounds_device.data()), @@ -323,27 +329,27 @@ void precompute_2d(cufftHandle &plan_kernel_tilde, float x_max, float x_min, flo // thrust::device_vector kernel_tilde_device(n_fft_coeffs * n_fft_coeffs); num_blocks = (n_interpolation_points_1d * n_interpolation_points_1d + num_threads - 1) / num_threads; compute_kernel_tilde<<>>( - thrust::raw_pointer_cast(kernel_tilde_device.data()), + thrust::raw_pointer_cast(kernel_tilde_device.data()), x_min, y_min, h, n_interpolation_points_1d, n_fft_coeffs); GpuErrorCheck(cudaDeviceSynchronize()); - + // Precompute the FFT of the kernel generating matrix - - cufftExecR2C(plan_kernel_tilde, + + cufftExecR2C(plan_kernel_tilde, reinterpret_cast(thrust::raw_pointer_cast(kernel_tilde_device.data())), reinterpret_cast(thrust::raw_pointer_cast(fft_kernel_tilde_device.data()))); - + } -void n_body_fft_2d( +void tsnecuda::NbodyFFT2D( cufftHandle &plan_dft, cufftHandle &plan_idft, - int N, - int n_terms, + int N, + int n_terms, int n_boxes, - int n_interpolation_points, + int n_interpolation_points, thrust::device_vector> &fft_kernel_tilde_device, int n_total_boxes, int total_interpolation_points, @@ -355,7 +361,7 @@ void n_body_fft_2d( thrust::device_vector &fft_input, thrust::device_vector> &fft_w_coefficients, thrust::device_vector &fft_output, - thrust::device_vector &point_box_idx_device, + thrust::device_vector &point_box_idx_device, thrust::device_vector &x_in_box_device, thrust::device_vector &y_in_box_device, thrust::device_vector &points_device, @@ -375,13 +381,13 @@ void n_body_fft_2d( // std::cout << "start" << std::endl; const int num_threads = 128; int num_blocks = (N + num_threads - 1) / num_threads; - + // Compute box indices and the relative position of each point in its box in the interval [0, 1] compute_point_box_idx<<>>( thrust::raw_pointer_cast(point_box_idx_device.data()), thrust::raw_pointer_cast(x_in_box_device.data()), thrust::raw_pointer_cast(y_in_box_device.data()), - thrust::raw_pointer_cast(points_device.data()), + thrust::raw_pointer_cast(points_device.data()), thrust::raw_pointer_cast(points_device.data() + num_nodes + 1), thrust::raw_pointer_cast(box_lower_bounds_device.data()), coord_min, @@ -392,11 +398,11 @@ void n_body_fft_2d( ); GpuErrorCheck(cudaDeviceSynchronize()); + /* * Step 1: Interpolate kernel using Lagrange polynomials and compute the w coefficients */ // Compute the interpolated values at each real point with each Lagrange polynomial in the `x` direction - num_blocks = (N * n_interpolation_points + num_threads - 1) / num_threads; interpolate_device<<>>( thrust::raw_pointer_cast(x_interpolated_values_device.data()), @@ -432,10 +438,11 @@ void n_body_fft_2d( n_terms ); GpuErrorCheck(cudaDeviceSynchronize()); + /* * Step 2: Compute the values v_{m, n} at the equispaced nodes, multiply the kernel matrix with the coefficients w */ - + num_blocks = ((n_terms * n_fft_coeffs_half * n_fft_coeffs_half) + num_threads - 1) / num_threads; copy_to_fft_input<<>>( thrust::raw_pointer_cast(fft_input.data()), @@ -446,21 +453,21 @@ void n_body_fft_2d( ); GpuErrorCheck(cudaDeviceSynchronize()); // Compute fft values at interpolated nodes - cufftExecR2C(plan_dft, - reinterpret_cast(thrust::raw_pointer_cast(fft_input.data())), + cufftExecR2C(plan_dft, + reinterpret_cast(thrust::raw_pointer_cast(fft_input.data())), reinterpret_cast(thrust::raw_pointer_cast(fft_w_coefficients.data()))); GpuErrorCheck(cudaDeviceSynchronize()); // Take the broadcasted Hadamard product of a complex matrix and a complex vector tsnecuda::util::BroadcastMatrixVector( - fft_w_coefficients, fft_kernel_tilde_device, n_fft_coeffs * (n_fft_coeffs / 2 + 1), n_terms, + fft_w_coefficients, fft_kernel_tilde_device, n_fft_coeffs * (n_fft_coeffs / 2 + 1), n_terms, thrust::multiplies>(), 0, thrust::complex(1.0)); - + // Invert the computed values at the interpolated nodes - cufftExecC2R(plan_idft, - reinterpret_cast(thrust::raw_pointer_cast(fft_w_coefficients.data())), + cufftExecC2R(plan_idft, + reinterpret_cast(thrust::raw_pointer_cast(fft_w_coefficients.data())), reinterpret_cast(thrust::raw_pointer_cast(fft_output.data()))); GpuErrorCheck(cudaDeviceSynchronize()); copy_from_fft_output<<>>( @@ -471,6 +478,7 @@ void n_body_fft_2d( n_terms ); GpuErrorCheck(cudaDeviceSynchronize()); + /* * Step 3: Compute the potentials \tilde{\phi} */ @@ -488,37 +496,3 @@ void n_body_fft_2d( ); GpuErrorCheck(cudaDeviceSynchronize()); } - -float* get_ntime() { - return _nbody_times; -} - - -void interpolate(int n_interpolation_points, int N, const float *y_in_box, const float *y_tilde_spacings, - float *interpolated_values, const float *denominator) { - // The denominators are the same across the interpolants, so we only need to compute them once - // auto *denominator = new float[n_interpolation_points]; - // for (int i = 0; i < n_interpolation_points; i++) { - // denominator[i] = 1; - // for (int j = 0; j < n_interpolation_points; j++) { - // if (i != j) { - // denominator[i] *= y_tilde_spacings[i] - y_tilde_spacings[j]; - // } - // } - // } - - // Compute the numerators and the interpolant value - for (int i = 0; i < N; i++) { - for (int j = 0; j < n_interpolation_points; j++) { - interpolated_values[j * N + i] = 1; - for (int k = 0; k < n_interpolation_points; k++) { - if (j != k) { - interpolated_values[j * N + i] *= y_in_box[i] - y_tilde_spacings[k]; - } - } - interpolated_values[j * N + i] /= denominator[j]; - } - } - - // delete[] denominator; -} From 2ecccbad04fca3a52a6de862666230b4c8bc3f05 Mon Sep 17 00:00:00 2001 From: Roshan Rao Date: Mon, 10 Jun 2019 17:06:37 -0700 Subject: [PATCH 05/14] move nbodyfft to kernels, fix CufftSafeCall --- CMakeLists.txt | 2 +- src/fit_tsne.cu | 71 +++------------------ src/include/common.h | 5 +- src/include/fit_tsne.h | 1 + src/include/{ => kernels}/nbodyfft.h | 6 +- src/include/macros.h | 14 ----- src/include/transforms.h | 0 src/include/util/cuda_utils.h | 13 ++-- src/{ => kernels}/nbodyfft.cu | 56 +---------------- src/util/cuda_utils.cu | 93 +++++++++++++++++++++++++--- src/util/data_utils.cu | 6 +- 11 files changed, 112 insertions(+), 155 deletions(-) rename src/include/{ => kernels}/nbodyfft.h (95%) delete mode 100644 src/include/macros.h delete mode 100644 src/include/transforms.h rename src/{ => kernels}/nbodyfft.cu (92%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 62a94e2..c609763 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -152,11 +152,11 @@ set(SOURCES src/kernels/apply_forces.cu src/kernels/attr_forces.cu src/kernels/perplexity_search.cu + src/kernels/nbodyfft.cu # Method files src/ext/pymodule_ext.cu src/fit_tsne.cu - src/nbodyfft.cu ) diff --git a/src/fit_tsne.cu b/src/fit_tsne.cu index 28cffc2..6bec506 100644 --- a/src/fit_tsne.cu +++ b/src/fit_tsne.cu @@ -3,61 +3,12 @@ */ #include "include/fit_tsne.h" -#include "include/nbodyfft.h" #include -#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__) - #define START_IL_TIMER() start = std::chrono::high_resolution_clock::now(); #define END_IL_TIMER(x) stop = std::chrono::high_resolution_clock::now(); duration = std::chrono::duration_cast(stop - start); x += duration; total_time += duration; #define PRINT_IL_TIMER(x) std::cout << #x << ": " << ((float) x.count()) / 1000000.0 << "s" << std::endl -static const char *_cudaGetErrorEnum(cufftResult error) -{ - switch (error) - { - case CUFFT_SUCCESS: - return "CUFFT_SUCCESS"; - - case CUFFT_INVALID_PLAN: - return "CUFFT_INVALID_PLAN"; - - case CUFFT_ALLOC_FAILED: - return "CUFFT_ALLOC_FAILED"; - - case CUFFT_INVALID_TYPE: - return "CUFFT_INVALID_TYPE"; - - case CUFFT_INVALID_VALUE: - return "CUFFT_INVALID_VALUE"; - - case CUFFT_INTERNAL_ERROR: - return "CUFFT_INTERNAL_ERROR"; - - case CUFFT_EXEC_FAILED: - return "CUFFT_EXEC_FAILED"; - - case CUFFT_SETUP_FAILED: - return "CUFFT_SETUP_FAILED"; - - case CUFFT_INVALID_SIZE: - return "CUFFT_INVALID_SIZE"; - - case CUFFT_UNALIGNED_DATA: - return "CUFFT_UNALIGNED_DATA"; - } - - return ""; -} - -inline void __cufftSafeCall(cufftResult err, const char *file, const int line) -{ - if( CUFFT_SUCCESS != err) { - fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \ - _cudaGetErrorEnum(err)); \ - cudaDeviceReset(); assert(0); \ - } -} float squared_cauchy_2d(float x1, float x2, float y1, float y2) { return pow(1.0 + pow(x1 - y1, 2) + pow(x2 - y2, 2), -2); @@ -245,11 +196,7 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt, cusparseSetMatIndexBase(sparse_matrix_descriptor,CUSPARSE_INDEX_BASE_ZERO); // Setup some return information if we're working on snapshots - int snap_interval; int snap_num = 0; - if (opt.return_style == tsnecuda::RETURN_STYLE::SNAPSHOT) { - snap_interval = opt.iterations / (opt.num_snapshots-1); - } // Get constants from options const int num_points = opt.num_points; @@ -461,18 +408,18 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt, // Create the FFT Handles cufftHandle plan_kernel_tilde, plan_dft, plan_idft;; - cufftSafeCall(cufftCreate(&plan_kernel_tilde)); - cufftSafeCall(cufftCreate(&plan_dft)); - cufftSafeCall(cufftCreate(&plan_idft)); + CufftSafeCall(cufftCreate(&plan_kernel_tilde)); + CufftSafeCall(cufftCreate(&plan_dft)); + CufftSafeCall(cufftCreate(&plan_idft)); size_t work_size, work_size_dft, work_size_idft; int fft_dimensions[2] = {n_fft_coeffs, n_fft_coeffs}; - cufftSafeCall(cufftMakePlan2d(plan_kernel_tilde, fft_dimensions[0], fft_dimensions[1], CUFFT_R2C, &work_size)); - cufftSafeCall(cufftMakePlanMany(plan_dft, 2, fft_dimensions, + CufftSafeCall(cufftMakePlan2d(plan_kernel_tilde, fft_dimensions[0], fft_dimensions[1], CUFFT_R2C, &work_size)); + CufftSafeCall(cufftMakePlanMany(plan_dft, 2, fft_dimensions, NULL, 1, n_fft_coeffs * n_fft_coeffs, NULL, 1, n_fft_coeffs * (n_fft_coeffs / 2 + 1), CUFFT_R2C, n_terms, &work_size_dft)); - cufftSafeCall(cufftMakePlanMany(plan_idft, 2, fft_dimensions, + CufftSafeCall(cufftMakePlanMany(plan_idft, 2, fft_dimensions, NULL, 1, n_fft_coeffs * (n_fft_coeffs / 2 + 1), NULL, 1, n_fft_coeffs * n_fft_coeffs, CUFFT_C2R, n_terms, &work_size_idft)); @@ -690,9 +637,9 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt, } - cufftSafeCall(cufftDestroy(plan_kernel_tilde)); - cufftSafeCall(cufftDestroy(plan_dft)); - cufftSafeCall(cufftDestroy(plan_idft)); + CufftSafeCall(cufftDestroy(plan_kernel_tilde)); + CufftSafeCall(cufftDestroy(plan_dft)); + CufftSafeCall(cufftDestroy(plan_idft)); if (opt.verbosity > 0) { PRINT_IL_TIMER(_time_initialization); diff --git a/src/include/common.h b/src/include/common.h index e7bbfa1..885a12a 100644 --- a/src/include/common.h +++ b/src/include/common.h @@ -1,6 +1,6 @@ /** * @brief Common includes for the Cuda-TSNE project - * + * * @file common.h * @author David Chan * @date 2018-04-04 @@ -57,7 +57,4 @@ #include #include -// Additional Macros -#include "macros.h" - #endif diff --git a/src/include/fit_tsne.h b/src/include/fit_tsne.h index 87387d9..61fe542 100644 --- a/src/include/fit_tsne.h +++ b/src/include/fit_tsne.h @@ -23,6 +23,7 @@ #include "include/kernels/apply_forces.h" #include "include/kernels/attr_forces.h" #include "include/kernels/perplexity_search.h" +#include "include/kernels/nbodyfft.h" namespace tsnecuda { void RunTsne(tsnecuda::Options &opt, tsnecuda::GpuOptions &gpu_opt); diff --git a/src/include/nbodyfft.h b/src/include/kernels/nbodyfft.h similarity index 95% rename from src/include/nbodyfft.h rename to src/include/kernels/nbodyfft.h index 01154f5..a5ab5cb 100644 --- a/src/include/nbodyfft.h +++ b/src/include/kernels/nbodyfft.h @@ -4,9 +4,9 @@ #include #include #include -#include "common.h" -#include "include/util/cuda_utils.h" -#include "include/util/matrix_broadcast_utils.h" +#include "../common.h" +#include "../util/cuda_utils.h" +#include "../util/matrix_broadcast_utils.h" namespace tsnecuda { diff --git a/src/include/macros.h b/src/include/macros.h deleted file mode 100644 index 32644d6..0000000 --- a/src/include/macros.h +++ /dev/null @@ -1,14 +0,0 @@ -/** - * @brief Definable macros for the program. Altering these may help with optimization - * - * @file macros.h - * @author David Chan - * @date 2018-04-04 - */ - -#ifndef MACROS_H -#define MACROS_H - -// Nothin here yet! - -#endif diff --git a/src/include/transforms.h b/src/include/transforms.h deleted file mode 100644 index e69de29..0000000 diff --git a/src/include/util/cuda_utils.h b/src/include/util/cuda_utils.h index af55001..111a729 100644 --- a/src/include/util/cuda_utils.h +++ b/src/include/util/cuda_utils.h @@ -1,17 +1,17 @@ /** * @brief Utility/Error Checking code. - * + * * @file cuda_utils.cu * @date 2018-04-04 - * Copyright (C) 2012-2017 Orange Owl Solutions. + * Copyright (C) 2012-2017 Orange Owl Solutions. */ /* CUDA Utilities - Utilities for high performance CPUs/C/C++ and GPUs/CUDA computing library. - - Copyright (C) 2012-2017 Orange Owl Solutions. - + Copyright (C) 2012-2017 Orange Owl Solutions. + + CUDA Utilities is free software: you can redistribute it and/or modify it under the terms of the Lesser GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or @@ -34,11 +34,12 @@ #ifndef CUDA_UTILITIES_CUH #define CUDA_UTILITIES_CUH -#include "include/common.h" +#include "../common.h" __host__ __device__ int iDivUp(int, int); extern "C" void CublasSafeCall(cublasStatus_t); extern "C" void CusparseSafeCall(cusparseStatus_t err); +extern "C" void CufftSafeCall(cufftResult err); extern "C" void GpuErrorCheck(cudaError_t ans); #endif diff --git a/src/nbodyfft.cu b/src/kernels/nbodyfft.cu similarity index 92% rename from src/nbodyfft.cu rename to src/kernels/nbodyfft.cu index 177116e..21adb08 100644 --- a/src/nbodyfft.cu +++ b/src/kernels/nbodyfft.cu @@ -1,58 +1,4 @@ -#include "include/nbodyfft.h" - -#define cufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__) - -clock_t _nbody_fft_timer; -float _nbody_times[10]; - -#define _ntime(x) _nbody_times[x] += ( (float) clock() - _nbody_fft_timer ) / CLOCKS_PER_SEC; _nbody_fft_timer = clock(); - -static const char *_cudaGetErrorEnum(cufftResult error) -{ - switch (error) - { - case CUFFT_SUCCESS: - return "CUFFT_SUCCESS"; - - case CUFFT_INVALID_PLAN: - return "CUFFT_INVALID_PLAN"; - - case CUFFT_ALLOC_FAILED: - return "CUFFT_ALLOC_FAILED"; - - case CUFFT_INVALID_TYPE: - return "CUFFT_INVALID_TYPE"; - - case CUFFT_INVALID_VALUE: - return "CUFFT_INVALID_VALUE"; - - case CUFFT_INTERNAL_ERROR: - return "CUFFT_INTERNAL_ERROR"; - - case CUFFT_EXEC_FAILED: - return "CUFFT_EXEC_FAILED"; - - case CUFFT_SETUP_FAILED: - return "CUFFT_SETUP_FAILED"; - - case CUFFT_INVALID_SIZE: - return "CUFFT_INVALID_SIZE"; - - case CUFFT_UNALIGNED_DATA: - return "CUFFT_UNALIGNED_DATA"; - } - - return ""; -} - -inline void __cufftSafeCall(cufftResult err, const char *file, const int line) -{ - if( CUFFT_SUCCESS != err) { - fprintf(stderr, "CUFFT error in file '%s', line %d\n %s\nerror %d: %s\nterminating!\n",__FILE__, __LINE__,err, \ - _cudaGetErrorEnum(err)); \ - cudaDeviceReset(); assert(0); \ - } -} +#include "../include/kernels/nbodyfft.h" __global__ void copy_to_fft_input(volatile float * __restrict__ fft_input, const float * w_coefficients_device, diff --git a/src/util/cuda_utils.cu b/src/util/cuda_utils.cu index 14d194a..78edaf4 100644 --- a/src/util/cuda_utils.cu +++ b/src/util/cuda_utils.cu @@ -1,18 +1,18 @@ /** * @brief Utility/Error Checking code. - * + * * @file cuda_utils.cu * @date 2018-04-04 - * Copyright (C) 2012-2017 Orange Owl Solutions. + * Copyright (C) 2012-2017 Orange Owl Solutions. */ /* CUDA Utilities - Utilities for high performance CPUs/C/C++ and GPUs/CUDA computing library. - - Copyright (C) 2012-2017 Orange Owl Solutions. - + Copyright (C) 2012-2017 Orange Owl Solutions. + + CUDA Utilities is free software: you can redistribute it and/or modify it under the terms of the Lesser GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or @@ -31,7 +31,7 @@ or send an e-mail to: info@orangeowlsolutions.com */ -#include "include/util/cuda_utils.h" +#include "../include/util/cuda_utils.h" /*******************/ /* iDivUp FUNCTION */ @@ -159,5 +159,84 @@ inline void __CusparseSafeCall(cusparseStatus_t err, const char *file, const int extern "C" void CusparseSafeCall(cusparseStatus_t err) { __CusparseSafeCall(err, __FILE__, __LINE__); } +/************************/ +/* CUFFT ERROR CHECKING */ +/************************/ + +static const char *_cufftGetErrorEnum(cufftResult error) +{ + switch (error) + { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; + + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; + + case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; + + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; + + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; + + default: + return ""; + } +} + +inline void __cufftSafeCall(cufftResult err, const char *file, const int line) +{ + if( CUFFT_SUCCESS != err) { + fprintf(stderr, + "CUFFT error in file '%s', line %d, error %s\nterminating!\n", + __FILE__, + __LINE__, + _cufftGetErrorEnum(err)); + cudaDeviceReset(); + assert(0); + } +} + +extern "C" void CufftSafeCall(cufftResult err) { __cufftSafeCall(err, __FILE__, __LINE__); } + -/// END OF CUDA UTILITIES \ No newline at end of file +/// END OF CUDA UTILITIES diff --git a/src/util/data_utils.cu b/src/util/data_utils.cu index 19fb93a..16d2fc7 100644 --- a/src/util/data_utils.cu +++ b/src/util/data_utils.cu @@ -1,13 +1,13 @@ /** * @brief Implementation file of the data_utils.h header - * + * * @file data_utils.cu * @author your name * @date 2018-05-05 * Copyright (c) 2018, Regents of the University of Californias */ -#include "include/util/data_utils.h" +#include "../include/util/data_utils.h" float* tsnecuda::util::LoadMnist(std::string file_name, int32_t& num_images, int32_t& num_rows, int32_t& num_columns) { @@ -92,7 +92,7 @@ float* tsnecuda::util::LoadCifar10(std::string file_path) { for (size_t i = 0; i < 5; i++) { char binary_file_name[50]; snprintf(binary_file_name, sizeof(binary_file_name), - "/data_batch_%d.bin", i + 1); + "/data_batch_%zu.bin", i + 1); std::string file_name = file_path; file_name.append(binary_file_name); From 2a6402efac866d950d8fab146c6b32bbbc6e40d8 Mon Sep 17 00:00:00 2001 From: Roshan Rao Date: Mon, 10 Jun 2019 17:24:59 -0700 Subject: [PATCH 06/14] create rep_forces.cu file --- CMakeLists.txt | 1 + src/fit_tsne.cu | 106 ++---------------------------- src/include/fit_tsne.h | 1 + src/include/kernels/attr_forces.h | 4 +- src/include/kernels/rep_forces.h | 27 ++++++++ src/kernels/rep_forces.cu | 94 ++++++++++++++++++++++++++ 6 files changed, 129 insertions(+), 104 deletions(-) create mode 100644 src/include/kernels/rep_forces.h create mode 100644 src/kernels/rep_forces.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index c609763..e91cf0e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -151,6 +151,7 @@ set(SOURCES # Kernels src/kernels/apply_forces.cu src/kernels/attr_forces.cu + src/kernels/rep_forces.cu src/kernels/perplexity_search.cu src/kernels/nbodyfft.cu diff --git a/src/fit_tsne.cu b/src/fit_tsne.cu index 6bec506..472905e 100644 --- a/src/fit_tsne.cu +++ b/src/fit_tsne.cu @@ -9,7 +9,6 @@ #define END_IL_TIMER(x) stop = std::chrono::high_resolution_clock::now(); duration = std::chrono::duration_cast(stop - start); x += duration; total_time += duration; #define PRINT_IL_TIMER(x) std::cout << #x << ": " << ((float) x.count()) / 1000000.0 << "s" << std::endl - float squared_cauchy_2d(float x1, float x2, float y1, float y2) { return pow(1.0 + pow(x1 - y1, 2) + pow(x2 - y2, 2), -2); } @@ -59,98 +58,6 @@ struct minmax_binary_op } }; -__global__ void compute_repulsive_forces_kernel( - volatile float * __restrict__ repulsive_forces_device, - volatile float * __restrict__ normalization_vec_device, - const float * const xs, - const float * const ys, - const float * const potentialsQij, - const int num_points, - const int num_nodes, - const int n_terms) -{ - register int TID = threadIdx.x + blockIdx.x * blockDim.x; - if (TID >= num_points) - return; - - register float phi1, phi2, phi3, phi4, x_pt, y_pt; - - phi1 = potentialsQij[TID * n_terms + 0]; - phi2 = potentialsQij[TID * n_terms + 1]; - phi3 = potentialsQij[TID * n_terms + 2]; - phi4 = potentialsQij[TID * n_terms + 3]; - - x_pt = xs[TID]; - y_pt = ys[TID]; - - normalization_vec_device[TID] = - (1 + x_pt * x_pt + y_pt * y_pt) * phi1 - 2 * (x_pt * phi2 + y_pt * phi3) + phi4; - - repulsive_forces_device[TID] = x_pt * phi1 - phi2; - repulsive_forces_device[TID + num_nodes + 1] = y_pt * phi1 - phi3; -} - -float compute_repulsive_forces( - thrust::device_vector &repulsive_forces_device, - thrust::device_vector &normalization_vec_device, - thrust::device_vector &points_device, - thrust::device_vector &potentialsQij, - const int num_points, - const int num_nodes, - const int n_terms) -{ - const int num_threads = 1024; - const int num_blocks = (num_points + num_threads - 1) / num_threads; - compute_repulsive_forces_kernel<<>>( - thrust::raw_pointer_cast(repulsive_forces_device.data()), - thrust::raw_pointer_cast(normalization_vec_device.data()), - thrust::raw_pointer_cast(points_device.data()), - thrust::raw_pointer_cast(points_device.data() + num_nodes + 1), - thrust::raw_pointer_cast(potentialsQij.data()), - num_points, num_nodes, n_terms); - float sumQ = thrust::reduce( - normalization_vec_device.begin(), normalization_vec_device.end(), 0, - thrust::plus()); - return sumQ - num_points; -} - -__global__ void compute_chargesQij_kernel( - volatile float * __restrict__ chargesQij, - const float * const xs, - const float * const ys, - const int num_points, - const int n_terms) -{ - register int TID = threadIdx.x + blockIdx.x * blockDim.x; - if (TID >= num_points) - return; - - register float x_pt, y_pt; - x_pt = xs[TID]; - y_pt = ys[TID]; - - chargesQij[TID * n_terms + 0] = 1; - chargesQij[TID * n_terms + 1] = x_pt; - chargesQij[TID * n_terms + 2] = y_pt; - chargesQij[TID * n_terms + 3] = x_pt * x_pt + y_pt * y_pt; -} - -void compute_chargesQij( - thrust::device_vector &chargesQij, - thrust::device_vector &points_device, - const int num_points, - const int num_nodes, - const int n_terms) -{ - const int num_threads = 1024; - const int num_blocks = (num_points + num_threads - 1) / num_threads; - compute_chargesQij_kernel<<>>( - thrust::raw_pointer_cast(chargesQij.data()), - thrust::raw_pointer_cast(points_device.data()), - thrust::raw_pointer_cast(points_device.data() + num_nodes + 1), - num_points, n_terms); -} - void tsnecuda::RunTsne(tsnecuda::Options &opt, tsnecuda::GpuOptions &gpu_opt) { @@ -495,7 +402,7 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt, // Prepare the terms that we'll use to compute the sum i.e. the repulsive forces START_IL_TIMER(); - compute_chargesQij(chargesQij_device, points_device, num_points, num_points - 1, n_terms); + tsnecuda::ComputeChargesQij(chargesQij_device, points_device, num_points, num_points - 1, n_terms); END_IL_TIMER(_time_compute_charges); // Compute Minimax elements @@ -538,7 +445,7 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt, // different ones. We subtract N at the end because the following sums over all i, j, whereas Z contains i \neq j // Make the negative term, or F_rep in the equation 3 of the paper - normalization = compute_repulsive_forces( + normalization = tsnecuda::ComputeRepulsiveForces( repulsive_forces_device, normalization_vec_device, points_device, potentialsQij_device, num_points, num_points - 1, n_terms); @@ -578,15 +485,12 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt, num_points - 1, num_points, num_blocks); - // Add a bit of random motion to prevent points from being on top of each other - // thrust::transform(points_device.begin(), points_device.end(), random_vector_device.begin(), - // points_device.begin(), thrust::plus()); // // Compute the gradient norm // tsnecuda::util::SquareDeviceVector(attractive_forces_device, old_forces_device); // thrust::transform(attractive_forces_device.begin(), attractive_forces_device.begin()+num_points, - // attractive_forces_device.begin()+num_points, attractive_forces_device.begin(), - // thrust::plus()); + // attractive_forces_device.begin()+num_points, attractive_forces_device.begin(), + // thrust::plus()); // tsnecuda::util::SqrtDeviceVector(attractive_forces_device, attractive_forces_device); // float grad_norm = thrust::reduce( // attractive_forces_device.begin(), attractive_forces_device.begin() + num_points, @@ -655,8 +559,6 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt, PRINT_IL_TIMER(_time_apply_forces); PRINT_IL_TIMER(_time_other); PRINT_IL_TIMER(total_time); - - // std::cout << "Total Time: " << ((float) total_time.count()) / 1000000.0 << std::endl; } diff --git a/src/include/fit_tsne.h b/src/include/fit_tsne.h index 61fe542..5eeea42 100644 --- a/src/include/fit_tsne.h +++ b/src/include/fit_tsne.h @@ -24,6 +24,7 @@ #include "include/kernels/attr_forces.h" #include "include/kernels/perplexity_search.h" #include "include/kernels/nbodyfft.h" +#include "include/kernels/rep_forces.h" namespace tsnecuda { void RunTsne(tsnecuda::Options &opt, tsnecuda::GpuOptions &gpu_opt); diff --git a/src/include/kernels/attr_forces.h b/src/include/kernels/attr_forces.h index 59e3404..94f8c29 100644 --- a/src/include/kernels/attr_forces.h +++ b/src/include/kernels/attr_forces.h @@ -6,8 +6,8 @@ * @date 2018-05-08 * Copyright (c) 2018, Regents of the University of California */ -#ifndef SRC_INCLUDE_KERNELS_BH_ATTR_FORCES_H_ -#define SRC_INCLUDE_KERNELS_BH_ATTR_FORCES_H_ +#ifndef SRC_INCLUDE_KERNELS_ATTR_FORCES_H_ +#define SRC_INCLUDE_KERNELS_ATTR_FORCES_H_ #include "../common.h" #include "../options.h" diff --git a/src/include/kernels/rep_forces.h b/src/include/kernels/rep_forces.h new file mode 100644 index 0000000..5c0e61e --- /dev/null +++ b/src/include/kernels/rep_forces.h @@ -0,0 +1,27 @@ +#ifndef SRC_INCLUDE_KERNELS_REP_FORCES_H_ +#define SRC_INCLUDE_KERNELS_REP_FORCES_H_ + +#include "../common.h" +#include "../options.h" +#include "../util/cuda_utils.h" + +namespace tsnecuda { + +float ComputeRepulsiveForces( + thrust::device_vector &repulsive_forces_device, + thrust::device_vector &normalization_vec_device, + thrust::device_vector &points_device, + thrust::device_vector &potentialsQij, + const int num_points, + const int num_nodes, + const int n_terms); + +void ComputeChargesQij( + thrust::device_vector &chargesQij, + thrust::device_vector &points_device, + const int num_points, + const int num_nodes, + const int n_terms); +} + +#endif diff --git a/src/kernels/rep_forces.cu b/src/kernels/rep_forces.cu new file mode 100644 index 0000000..6accb35 --- /dev/null +++ b/src/kernels/rep_forces.cu @@ -0,0 +1,94 @@ +#include "../include/kernels/rep_forces.h" + + +__global__ void compute_repulsive_forces_kernel( + volatile float * __restrict__ repulsive_forces_device, + volatile float * __restrict__ normalization_vec_device, + const float * const xs, + const float * const ys, + const float * const potentialsQij, + const int num_points, + const int num_nodes, + const int n_terms) +{ + register int TID = threadIdx.x + blockIdx.x * blockDim.x; + if (TID >= num_points) + return; + + register float phi1, phi2, phi3, phi4, x_pt, y_pt; + + phi1 = potentialsQij[TID * n_terms + 0]; + phi2 = potentialsQij[TID * n_terms + 1]; + phi3 = potentialsQij[TID * n_terms + 2]; + phi4 = potentialsQij[TID * n_terms + 3]; + + x_pt = xs[TID]; + y_pt = ys[TID]; + + normalization_vec_device[TID] = + (1 + x_pt * x_pt + y_pt * y_pt) * phi1 - 2 * (x_pt * phi2 + y_pt * phi3) + phi4; + + repulsive_forces_device[TID] = x_pt * phi1 - phi2; + repulsive_forces_device[TID + num_nodes + 1] = y_pt * phi1 - phi3; +} + +float tsnecuda::ComputeRepulsiveForces( + thrust::device_vector &repulsive_forces_device, + thrust::device_vector &normalization_vec_device, + thrust::device_vector &points_device, + thrust::device_vector &potentialsQij, + const int num_points, + const int num_nodes, + const int n_terms) +{ + const int num_threads = 1024; + const int num_blocks = (num_points + num_threads - 1) / num_threads; + compute_repulsive_forces_kernel<<>>( + thrust::raw_pointer_cast(repulsive_forces_device.data()), + thrust::raw_pointer_cast(normalization_vec_device.data()), + thrust::raw_pointer_cast(points_device.data()), + thrust::raw_pointer_cast(points_device.data() + num_nodes + 1), + thrust::raw_pointer_cast(potentialsQij.data()), + num_points, num_nodes, n_terms); + float sumQ = thrust::reduce( + normalization_vec_device.begin(), normalization_vec_device.end(), 0, + thrust::plus()); + return sumQ - num_points; +} + +__global__ void compute_chargesQij_kernel( + volatile float * __restrict__ chargesQij, + const float * const xs, + const float * const ys, + const int num_points, + const int n_terms) +{ + register int TID = threadIdx.x + blockIdx.x * blockDim.x; + if (TID >= num_points) + return; + + register float x_pt, y_pt; + x_pt = xs[TID]; + y_pt = ys[TID]; + + chargesQij[TID * n_terms + 0] = 1; + chargesQij[TID * n_terms + 1] = x_pt; + chargesQij[TID * n_terms + 2] = y_pt; + chargesQij[TID * n_terms + 3] = x_pt * x_pt + y_pt * y_pt; +} + +void tsnecuda::ComputeChargesQij( + thrust::device_vector &chargesQij, + thrust::device_vector &points_device, + const int num_points, + const int num_nodes, + const int n_terms) +{ + const int num_threads = 1024; + const int num_blocks = (num_points + num_threads - 1) / num_threads; + compute_chargesQij_kernel<<>>( + thrust::raw_pointer_cast(chargesQij.data()), + thrust::raw_pointer_cast(points_device.data()), + thrust::raw_pointer_cast(points_device.data() + num_nodes + 1), + num_points, n_terms); +} From 1c0d8fbf2e17b207880a8a2963e0c2efb98ec5b3 Mon Sep 17 00:00:00 2001 From: Roshan Rao Date: Mon, 10 Jun 2019 17:27:04 -0700 Subject: [PATCH 07/14] delete unused minmax --- src/fit_tsne.cu | 66 ++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/src/fit_tsne.cu b/src/fit_tsne.cu index 472905e..921edcb 100644 --- a/src/fit_tsne.cu +++ b/src/fit_tsne.cu @@ -16,48 +16,48 @@ float squared_cauchy_2d(float x1, float x2, float y1, float y2) { // minmax_pair stores the minimum and maximum // values that have been encountered so far -template -struct minmax_pair -{ - T min_val; - T max_val; -}; +// template +// struct minmax_pair +// { + // T min_val; + // T max_val; +// }; // minmax_unary_op is a functor that takes in a value x and // returns a minmax_pair whose minimum and maximum values // are initialized to x. -template -struct minmax_unary_op - : public thrust::unary_function< T, minmax_pair > -{ - __host__ __device__ - minmax_pair operator()(const T& x) const - { - minmax_pair result; - result.min_val = x; - result.max_val = x; - return result; - } -}; +// template +// struct minmax_unary_op + // : public thrust::unary_function< T, minmax_pair > +// { + // __host__ __device__ + // minmax_pair operator()(const T& x) const + // { + // minmax_pair result; + // result.min_val = x; + // result.max_val = x; + // return result; + // } +// }; // minmax_binary_op is a functor that accepts two minmax_pair // structs and returns a new minmax_pair whose minimum and // maximum values are the min() and max() respectively of // the minimums and maximums of the input pairs -template -struct minmax_binary_op - : public thrust::binary_function< minmax_pair, minmax_pair, minmax_pair > -{ - __host__ __device__ - minmax_pair operator()(const minmax_pair& x, const minmax_pair& y) const - { - minmax_pair result; - result.min_val = thrust::min(x.min_val, y.min_val); - result.max_val = thrust::max(x.max_val, y.max_val); - return result; - } -}; - +// template +// struct minmax_binary_op + // : public thrust::binary_function< minmax_pair, minmax_pair, minmax_pair > +// { + // __host__ __device__ + // minmax_pair operator()(const minmax_pair& x, const minmax_pair& y) const + // { + // minmax_pair result; + // result.min_val = thrust::min(x.min_val, y.min_val); + // result.max_val = thrust::max(x.max_val, y.max_val); + // return result; + // } +// }; +// void tsnecuda::RunTsne(tsnecuda::Options &opt, tsnecuda::GpuOptions &gpu_opt) { From 86b2695ada0d1202734dd682d07422e9fb266193 Mon Sep 17 00:00:00 2001 From: Roshan Rao Date: Mon, 10 Jun 2019 17:27:41 -0700 Subject: [PATCH 08/14] delete squared_cauchy_2d - this is on GPU now --- src/fit_tsne.cu | 49 ------------------------------------------------- 1 file changed, 49 deletions(-) diff --git a/src/fit_tsne.cu b/src/fit_tsne.cu index 921edcb..f89845f 100644 --- a/src/fit_tsne.cu +++ b/src/fit_tsne.cu @@ -9,55 +9,6 @@ #define END_IL_TIMER(x) stop = std::chrono::high_resolution_clock::now(); duration = std::chrono::duration_cast(stop - start); x += duration; total_time += duration; #define PRINT_IL_TIMER(x) std::cout << #x << ": " << ((float) x.count()) / 1000000.0 << "s" << std::endl -float squared_cauchy_2d(float x1, float x2, float y1, float y2) { - return pow(1.0 + pow(x1 - y1, 2) + pow(x2 - y2, 2), -2); -} -// compute minimum and maximum values in a single reduction - -// minmax_pair stores the minimum and maximum -// values that have been encountered so far -// template -// struct minmax_pair -// { - // T min_val; - // T max_val; -// }; - -// minmax_unary_op is a functor that takes in a value x and -// returns a minmax_pair whose minimum and maximum values -// are initialized to x. -// template -// struct minmax_unary_op - // : public thrust::unary_function< T, minmax_pair > -// { - // __host__ __device__ - // minmax_pair operator()(const T& x) const - // { - // minmax_pair result; - // result.min_val = x; - // result.max_val = x; - // return result; - // } -// }; - -// minmax_binary_op is a functor that accepts two minmax_pair -// structs and returns a new minmax_pair whose minimum and -// maximum values are the min() and max() respectively of -// the minimums and maximums of the input pairs -// template -// struct minmax_binary_op - // : public thrust::binary_function< minmax_pair, minmax_pair, minmax_pair > -// { - // __host__ __device__ - // minmax_pair operator()(const minmax_pair& x, const minmax_pair& y) const - // { - // minmax_pair result; - // result.min_val = thrust::min(x.min_val, y.min_val); - // result.max_val = thrust::max(x.max_val, y.max_val); - // return result; - // } -// }; -// void tsnecuda::RunTsne(tsnecuda::Options &opt, tsnecuda::GpuOptions &gpu_opt) { From 092fe6b771c45ce90aab740ee7e6065374755678 Mon Sep 17 00:00:00 2001 From: Roshan Rao Date: Mon, 10 Jun 2019 17:36:00 -0700 Subject: [PATCH 09/14] re-add support for snapshotting --- src/fit_tsne.cu | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/src/fit_tsne.cu b/src/fit_tsne.cu index f89845f..52f8284 100644 --- a/src/fit_tsne.cu +++ b/src/fit_tsne.cu @@ -55,6 +55,10 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt, // Setup some return information if we're working on snapshots int snap_num = 0; + int snap_interval = 1; + if (opt.return_style == tsnecuda::RETURN_STYLE::SNAPSHOT) { + snap_interval = opt.iterations / (opt.num_snapshots - 1); + } // Get constants from options const int num_points = opt.num_points; @@ -68,23 +72,16 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt, float attr_exaggeration = opt.early_exaggeration; float normalization; - // Figure out number of nodes needed for BH tree - int nnodes = num_points * 2; - if (nnodes < 1024 * num_blocks) nnodes = 1024 * num_blocks; - while ((nnodes & (gpu_opt.warp_size - 1)) != 0) - nnodes++; - nnodes--; - const int num_nodes = nnodes; - opt.num_nodes = num_nodes; - // Allocate host memory float *knn_squared_distances = new float[num_points * num_neighbors]; memset(knn_squared_distances, 0, num_points * num_neighbors * sizeof(float)); long *knn_indices = new long[num_points * num_neighbors]; - // Initialize global variables - thrust::device_vector err_device(1); - // tsnecuda::Initialize(gpu_opt, err_device); + // Set cache configs + cudaFuncSetCacheConfig(tsnecuda::IntegrationKernel, cudaFuncCachePreferL1); + cudaFuncSetCacheConfig(tsnecuda::ComputePijxQijKernel, cudaFuncCachePreferShared); + GpuErrorCheck(cudaDeviceSynchronize()); + END_IL_TIMER(_time_initialization); START_IL_TIMER(); @@ -330,11 +327,7 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt, std::cout << "This version is not built with ZMQ for interative viz. Rebuild with WITH_ZMQ=TRUE for viz." << std::endl; #endif - - END_IL_TIMER(_time_init_fft); - - // Support for infinite iteration for (size_t step = 0; step != opt.iterations; step++) { @@ -483,12 +476,12 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt, } // // Handle snapshoting - // if (opt.return_style == tsnecuda::RETURN_STYLE::SNAPSHOT && step % snap_interval == 0 && opt.return_data != nullptr) { - // thrust::copy(points_device.begin(), - // points_device.end(), - // snap_num*opt.num_points*2 + opt.return_data); - // snap_num += 1; - // } + if (opt.return_style == tsnecuda::RETURN_STYLE::SNAPSHOT && step % snap_interval == 0 && opt.return_data != nullptr) { + thrust::copy(points_device.begin(), + points_device.end(), + snap_num*opt.num_points*2 + opt.return_data); + snap_num += 1; + } } From f1d7ee766032b71e9d58a4de650a39e3d9e9f657 Mon Sep 17 00:00:00 2001 From: Roshan Rao Date: Mon, 10 Jun 2019 17:40:48 -0700 Subject: [PATCH 10/14] re-add support for gradient norms --- src/fit_tsne.cu | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/src/fit_tsne.cu b/src/fit_tsne.cu index 52f8284..1af3ef0 100644 --- a/src/fit_tsne.cu +++ b/src/fit_tsne.cu @@ -384,10 +384,6 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt, END_IL_TIMER(_time_nbodyfft); START_IL_TIMER(); - // Compute the normalization constant Z or sum of q_{ij}. This expression is different from the one in the original - // paper, but equivalent. This is done so we need only use a single kernel (K_2 in the paper) instead of two - // different ones. We subtract N at the end because the following sums over all i, j, whereas Z contains i \neq j - // Make the negative term, or F_rep in the equation 3 of the paper normalization = tsnecuda::ComputeRepulsiveForces( repulsive_forces_device, normalization_vec_device, points_device, @@ -431,24 +427,24 @@ void tsnecuda::RunTsne(tsnecuda::Options &opt, num_blocks); // // Compute the gradient norm - // tsnecuda::util::SquareDeviceVector(attractive_forces_device, old_forces_device); - // thrust::transform(attractive_forces_device.begin(), attractive_forces_device.begin()+num_points, - // attractive_forces_device.begin()+num_points, attractive_forces_device.begin(), - // thrust::plus()); - // tsnecuda::util::SqrtDeviceVector(attractive_forces_device, attractive_forces_device); - // float grad_norm = thrust::reduce( - // attractive_forces_device.begin(), attractive_forces_device.begin() + num_points, - // 0.0f, thrust::plus()) / num_points; - // thrust::fill(attractive_forces_device.begin(), attractive_forces_device.end(), 0.0f); - - // if (grad_norm < opt.min_gradient_norm) { - // if (opt.verbosity >= 1) std::cout << "Reached minimum gradient norm: " << grad_norm << std::endl; - // break; - // } - - // if (opt.verbosity >= 1 && step % opt.print_interval == 0) { - // std::cout << "[Step " << step << "] Avg. Gradient Norm: " << grad_norm << std::endl; - // } + tsnecuda::util::SquareDeviceVector(attractive_forces_device, old_forces_device); + thrust::transform(attractive_forces_device.begin(), attractive_forces_device.begin()+num_points, + attractive_forces_device.begin()+num_points, attractive_forces_device.begin(), + thrust::plus()); + tsnecuda::util::SqrtDeviceVector(attractive_forces_device, attractive_forces_device); + float grad_norm = thrust::reduce( + attractive_forces_device.begin(), attractive_forces_device.begin() + num_points, + 0.0f, thrust::plus()) / num_points; + thrust::fill(attractive_forces_device.begin(), attractive_forces_device.end(), 0.0f); + + if (grad_norm < opt.min_gradient_norm) { + if (opt.verbosity >= 1) std::cout << "Reached minimum gradient norm: " << grad_norm << std::endl; + break; + } + + if (opt.verbosity >= 1 && step % opt.print_interval == 0) { + std::cout << "[Step " << step << "] Avg. Gradient Norm: " << grad_norm << std::endl; + } END_IL_TIMER(_time_apply_forces); From c98a9f7d1f19b2a6dbaebf5e025788d92e929589 Mon Sep 17 00:00:00 2001 From: Roshan Rao Date: Mon, 10 Jun 2019 17:45:38 -0700 Subject: [PATCH 11/14] re-add compute architectures --- CMakeLists.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e91cf0e..16bd30e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,13 +58,13 @@ SET(CUDA_PROPAGATE_HOST_FLAGS OFF) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -O3 -Xptxas -dlcm=cg - # -gencode=arch=compute_30,code=sm_30 - # -gencode=arch=compute_35,code=sm_35 - # -gencode=arch=compute_50,code=sm_50 - # -gencode=arch=compute_52,code=sm_52 - # -gencode=arch=compute_60,code=sm_60 - -gencode=arch=compute_61,code=sm_61 - # -gencode=arch=compute_70,code=sm_70 + -gencode=arch=compute_30,code=sm_30 + -gencode=arch=compute_35,code=sm_35 + -gencode=arch=compute_50,code=sm_50 + -gencode=arch=compute_52,code=sm_52 + -gencode=arch=compute_60,code=sm_60 + -gencode=arch=compute_61,code=sm_61 + -gencode=arch=compute_70,code=sm_70 -std=c++11 -Xcompiler '-O3' -Xcompiler '-fPIC' From 9da214f28f038c2ca696f402764dcfddefdfebd5 Mon Sep 17 00:00:00 2001 From: Roshan Rao Date: Mon, 10 Jun 2019 18:16:09 -0700 Subject: [PATCH 12/14] fix relative imports --- CMakeLists.txt | 12 ++++++------ build.sh | 7 +++---- meta.yaml | 4 ++-- src/include/kernels/apply_forces.h | 6 +++--- src/include/kernels/attr_forces.h | 6 +++--- src/include/kernels/nbodyfft.h | 6 +++--- src/include/kernels/perplexity_search.h | 12 ++++++------ src/include/kernels/rep_forces.h | 6 +++--- src/include/util/cuda_utils.h | 2 +- src/kernels/apply_forces.cu | 2 +- src/kernels/initialization.cu | 13 ------------- src/kernels/nbodyfft.cu | 2 +- src/kernels/rep_forces.cu | 2 +- src/util/data_utils.cu | 2 +- 14 files changed, 34 insertions(+), 48 deletions(-) delete mode 100644 src/kernels/initialization.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 16bd30e..20e04d7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,13 +58,13 @@ SET(CUDA_PROPAGATE_HOST_FLAGS OFF) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -O3 -Xptxas -dlcm=cg - -gencode=arch=compute_30,code=sm_30 - -gencode=arch=compute_35,code=sm_35 - -gencode=arch=compute_50,code=sm_50 - -gencode=arch=compute_52,code=sm_52 - -gencode=arch=compute_60,code=sm_60 + # -gencode=arch=compute_30,code=sm_30 + # -gencode=arch=compute_35,code=sm_35 + # -gencode=arch=compute_50,code=sm_50 + # -gencode=arch=compute_52,code=sm_52 + # -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 - -gencode=arch=compute_70,code=sm_70 + # -gencode=arch=compute_70,code=sm_70 -std=c++11 -Xcompiler '-O3' -Xcompiler '-fPIC' diff --git a/build.sh b/build.sh index b4e85cd..b930e39 100755 --- a/build.sh +++ b/build.sh @@ -1,11 +1,10 @@ -export PATH=/usr/local/cuda-9.0/bin:$PATH -export LD_LIBRARY_PATH=/usr/local/cuda-9.0/lib64:$LD_LIBRARY_PATH +# export PATH=/usr/local/cuda-9.0/bin:$PATH +# export LD_LIBRARY_PATH=/usr/local/cuda-9.0/lib64:$LD_LIBRARY_PATH git submodule init -git submodule update +git submodule update cd ./build cmake .. -DBUILD_PYTHON=TRUE -DWITH_MKL=FALSE -DCMAKE_C_COMPILER=gcc-4.9 -DCMAKE_CXX_COMPILER=g++-4.9 pwd -make -j10 make cd python/ pwd diff --git a/meta.yaml b/meta.yaml index 3b9437c..3180603 100644 --- a/meta.yaml +++ b/meta.yaml @@ -1,5 +1,5 @@ {% set name = "tsnecuda" %} -{% set version = "0.1.1" %} +{% set version = "0.2.1" %} package: name: '{{ name|lower }}' @@ -8,7 +8,7 @@ package: source: git_url: https://github.com/CannyLab/tsne-cuda.git git_rev: HEAD - + requirements: build: - python diff --git a/src/include/kernels/apply_forces.h b/src/include/kernels/apply_forces.h index 9aeb815..f755663 100644 --- a/src/include/kernels/apply_forces.h +++ b/src/include/kernels/apply_forces.h @@ -10,9 +10,9 @@ #ifndef SRC_INCLUDE_KERNELS_APPLY_FORCES_H_ #define SRC_INCLUDE_KERNELS_APPLY_FORCES_H_ -#include "../common.h" -#include "../options.h" -#include "../util/cuda_utils.h" +#include "common.h" +#include "options.h" +#include "util/cuda_utils.h" namespace tsnecuda { __global__ diff --git a/src/include/kernels/attr_forces.h b/src/include/kernels/attr_forces.h index 94f8c29..82e6116 100644 --- a/src/include/kernels/attr_forces.h +++ b/src/include/kernels/attr_forces.h @@ -9,9 +9,9 @@ #ifndef SRC_INCLUDE_KERNELS_ATTR_FORCES_H_ #define SRC_INCLUDE_KERNELS_ATTR_FORCES_H_ -#include "../common.h" -#include "../options.h" -#include "../util/cuda_utils.h" +#include "common.h" +#include "options.h" +#include "util/cuda_utils.h" namespace tsnecuda { diff --git a/src/include/kernels/nbodyfft.h b/src/include/kernels/nbodyfft.h index a5ab5cb..73bcafb 100644 --- a/src/include/kernels/nbodyfft.h +++ b/src/include/kernels/nbodyfft.h @@ -4,9 +4,9 @@ #include #include #include -#include "../common.h" -#include "../util/cuda_utils.h" -#include "../util/matrix_broadcast_utils.h" +#include "common.h" +#include "util/cuda_utils.h" +#include "util/matrix_broadcast_utils.h" namespace tsnecuda { diff --git a/src/include/kernels/perplexity_search.h b/src/include/kernels/perplexity_search.h index 94d76e0..2ea7f1a 100644 --- a/src/include/kernels/perplexity_search.h +++ b/src/include/kernels/perplexity_search.h @@ -9,12 +9,12 @@ #ifndef SRC_INCLUDE_KERNELS_PERPLEXITY_SEARCH_H_ #define SRC_INCLUDE_KERNELS_PERPLEXITY_SEARCH_H_ -#include "../common.h" -#include "../options.h" -#include "../util/cuda_utils.h" -#include "../util/reduce_utils.h" -#include "../util/matrix_broadcast_utils.h" -#include "../util/thrust_transform_functions.h" +#include "common.h" +#include "options.h" +#include "util/cuda_utils.h" +#include "util/reduce_utils.h" +#include "util/matrix_broadcast_utils.h" +#include "util/thrust_transform_functions.h" namespace tsnecuda { __global__ diff --git a/src/include/kernels/rep_forces.h b/src/include/kernels/rep_forces.h index 5c0e61e..8c10cc6 100644 --- a/src/include/kernels/rep_forces.h +++ b/src/include/kernels/rep_forces.h @@ -1,9 +1,9 @@ #ifndef SRC_INCLUDE_KERNELS_REP_FORCES_H_ #define SRC_INCLUDE_KERNELS_REP_FORCES_H_ -#include "../common.h" -#include "../options.h" -#include "../util/cuda_utils.h" +#include "common.h" +#include "options.h" +#include "util/cuda_utils.h" namespace tsnecuda { diff --git a/src/include/util/cuda_utils.h b/src/include/util/cuda_utils.h index 111a729..f535969 100644 --- a/src/include/util/cuda_utils.h +++ b/src/include/util/cuda_utils.h @@ -34,7 +34,7 @@ #ifndef CUDA_UTILITIES_CUH #define CUDA_UTILITIES_CUH -#include "../common.h" +#include "common.h" __host__ __device__ int iDivUp(int, int); extern "C" void CublasSafeCall(cublasStatus_t); diff --git a/src/kernels/apply_forces.cu b/src/kernels/apply_forces.cu index 0160993..cbd295a 100644 --- a/src/kernels/apply_forces.cu +++ b/src/kernels/apply_forces.cu @@ -2,7 +2,7 @@ Apply forces to the points with momentum, exaggeration, etc. */ -#include "../include/kernels/apply_forces.h" +#include "include/kernels/apply_forces.h" /******************************************************************************/ diff --git a/src/kernels/initialization.cu b/src/kernels/initialization.cu deleted file mode 100644 index b7d4259..0000000 --- a/src/kernels/initialization.cu +++ /dev/null @@ -1,13 +0,0 @@ -#include "../include/kernels/initialization.h" - -/******************************************************************************/ -/*** initialize memory ********************************************************/ -/******************************************************************************/ - - -void tsnecuda::Initialize(tsnecuda::GpuOptions &gpu_opt, thrust::device_vector &errd) -{ - cudaFuncSetCacheConfig(tsnecuda::bh::IntegrationKernel, cudaFuncCachePreferL1); - cudaFuncSetCacheConfig(tsnecuda::bh::ComputePijxQijKernel, cudaFuncCachePreferShared); - GpuErrorCheck(cudaDeviceSynchronize()); -} diff --git a/src/kernels/nbodyfft.cu b/src/kernels/nbodyfft.cu index 21adb08..5474517 100644 --- a/src/kernels/nbodyfft.cu +++ b/src/kernels/nbodyfft.cu @@ -1,4 +1,4 @@ -#include "../include/kernels/nbodyfft.h" +#include "include/kernels/nbodyfft.h" __global__ void copy_to_fft_input(volatile float * __restrict__ fft_input, const float * w_coefficients_device, diff --git a/src/kernels/rep_forces.cu b/src/kernels/rep_forces.cu index 6accb35..7878db7 100644 --- a/src/kernels/rep_forces.cu +++ b/src/kernels/rep_forces.cu @@ -1,4 +1,4 @@ -#include "../include/kernels/rep_forces.h" +#include "include/kernels/rep_forces.h" __global__ void compute_repulsive_forces_kernel( diff --git a/src/util/data_utils.cu b/src/util/data_utils.cu index 16d2fc7..3511ec0 100644 --- a/src/util/data_utils.cu +++ b/src/util/data_utils.cu @@ -7,7 +7,7 @@ * Copyright (c) 2018, Regents of the University of Californias */ -#include "../include/util/data_utils.h" +#include "include/util/data_utils.h" float* tsnecuda::util::LoadMnist(std::string file_name, int32_t& num_images, int32_t& num_rows, int32_t& num_columns) { From 89518c23861cf4782e8a1d4bdeff0af128c43f9d Mon Sep 17 00:00:00 2001 From: Roshan Rao Date: Mon, 10 Jun 2019 18:20:27 -0700 Subject: [PATCH 13/14] remove relative imports --- src/exe/main.cu | 8 ++------ src/ext/pymodule_ext.cu | 2 +- src/include/ext/pymodule_ext.h | 8 ++++---- src/kernels/attr_forces.cu | 2 +- src/kernels/perplexity_search.cu | 2 +- src/util/cuda_utils.cu | 2 +- 6 files changed, 10 insertions(+), 14 deletions(-) diff --git a/src/exe/main.cu b/src/exe/main.cu index a3432f1..39a5cdf 100644 --- a/src/exe/main.cu +++ b/src/exe/main.cu @@ -2,12 +2,8 @@ // args, so we don't have to re-build to change options. // Detailed includes -// #include "../include/common.h" -#include "../include/util/data_utils.h" -// #include "../include/util/cuda_utils.h" -// #include "../include/util/random_utils.h" -// #include "../include/util/distance_utils.h" -#include "../include/fit_tsne.h" +#include "include/util/data_utils.h" +#include "include/fit_tsne.h" #include #include diff --git a/src/ext/pymodule_ext.cu b/src/ext/pymodule_ext.cu index 7c2dff3..9a768d7 100644 --- a/src/ext/pymodule_ext.cu +++ b/src/ext/pymodule_ext.cu @@ -1,7 +1,7 @@ // Implementation file for the python extensions -#include "../include/ext/pymodule_ext.h" +#include "ext/pymodule_ext.h" void pymodule_bh_tsne(float *result, float* points, diff --git a/src/include/ext/pymodule_ext.h b/src/include/ext/pymodule_ext.h index ef9f6e9..56bd7f6 100644 --- a/src/include/ext/pymodule_ext.h +++ b/src/include/ext/pymodule_ext.h @@ -2,10 +2,10 @@ #define PYMODULE_EXT_H #include - #include "../common.h" - #include "../options.h" - #include "../util/distance_utils.h" - #include "../fit_tsne.h" + #include "common.h" + #include "options.h" + #include "util/distance_utils.h" + #include "fit_tsne.h" extern "C" { /** diff --git a/src/kernels/attr_forces.cu b/src/kernels/attr_forces.cu index 1d83778..498c197 100644 --- a/src/kernels/attr_forces.cu +++ b/src/kernels/attr_forces.cu @@ -6,7 +6,7 @@ Attractive force is given by pij*qij. */ -#include "../include/kernels/attr_forces.h" +#include "kernels/attr_forces.h" __global__ void tsnecuda::ComputePijxQijKernel( diff --git a/src/kernels/perplexity_search.cu b/src/kernels/perplexity_search.cu index 884979f..ed16529 100644 --- a/src/kernels/perplexity_search.cu +++ b/src/kernels/perplexity_search.cu @@ -5,7 +5,7 @@ Note that FAISS returns the first row as the same point, with distance = 0. pii is defined as zero. */ -#include "../include/kernels/perplexity_search.h" +#include "kernels/perplexity_search.h" __global__ void tsnecuda::ComputePijKernel( diff --git a/src/util/cuda_utils.cu b/src/util/cuda_utils.cu index 78edaf4..579a5fb 100644 --- a/src/util/cuda_utils.cu +++ b/src/util/cuda_utils.cu @@ -31,7 +31,7 @@ or send an e-mail to: info@orangeowlsolutions.com */ -#include "../include/util/cuda_utils.h" +#include "util/cuda_utils.h" /*******************/ /* iDivUp FUNCTION */ From 81b925fe64aedac5953b8450762e682ee2c60527 Mon Sep 17 00:00:00 2001 From: Roshan Rao Date: Mon, 10 Jun 2019 18:25:05 -0700 Subject: [PATCH 14/14] re-add the different architectures again --- CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 20e04d7..16bd30e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,13 +58,13 @@ SET(CUDA_PROPAGATE_HOST_FLAGS OFF) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -O3 -Xptxas -dlcm=cg - # -gencode=arch=compute_30,code=sm_30 - # -gencode=arch=compute_35,code=sm_35 - # -gencode=arch=compute_50,code=sm_50 - # -gencode=arch=compute_52,code=sm_52 - # -gencode=arch=compute_60,code=sm_60 + -gencode=arch=compute_30,code=sm_30 + -gencode=arch=compute_35,code=sm_35 + -gencode=arch=compute_50,code=sm_50 + -gencode=arch=compute_52,code=sm_52 + -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 - # -gencode=arch=compute_70,code=sm_70 + -gencode=arch=compute_70,code=sm_70 -std=c++11 -Xcompiler '-O3' -Xcompiler '-fPIC'