diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 44e174e0..5813de26 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -12,7 +12,7 @@ set(EXAMPLES "primes" "BFS" "word_counts" "tokens" "filter" "linefit" "knuth_morris_pratt" "huffman_tree" "decision_tree_c45" "karatsuba" "suffix_tree" "2d_linear_program" "box_kdtree" "radix_tree" "ray_trace" "hash_map" "oct_tree" "3d_range" "rectangle_intersection" "star_connectivity" "ldd_connectivity" "boruvka" - "counting_sort") + "counting_sort" "integer_sort") function(add_example NAME) add_executable(${NAME} ${NAME}.cpp) diff --git a/examples/counting_sort.h b/examples/counting_sort.h index 32e7e12e..82b94df4 100644 --- a/examples/counting_sort.h +++ b/examples/counting_sort.h @@ -5,30 +5,53 @@ #include #include -template -parlay::sequence -counting_sort(const InIt& begin, const InIt& end, - OutIt out, const KeyIt& keys, +// ************************************************************** +// Counting sort +// A parallel version of counting sort. It breaks the input into +// partitions and for each partition, in parallel, it counts how many +// of each key there are. It then uses a scan to calculate the offsets +// for each bucket in each partition, and does a final pass placing +// all keys in their correct position. +// ************************************************************** + +using counter_type = unsigned long; + +// ************************************************************** +// Input: +// begin and end iterators for the values to be rearranged +// begin iterator for the output (value_type must be the same) +// begin iterator for the keys (range must be same length as values) +// num_buckets : number of buckets (should be smallish, e.g. 256) +// Output: +// Offsets within output of each key. Will be of length +// num_buckets+1 since last entry will contain total size +// (i.e. end-begin). 
+// ************************************************************** +template +parlay::sequence +counting_sort(const InIt& begin, const InIt& end, + OutIt out, const KeyIt& keys, long num_buckets) { long n = end - begin; - long num_parts = n / (num_buckets * 64) + 1; + if (n == 0) return parlay::sequence(1, 0); + long num_parts = std::min(1000l, n / (num_buckets * 64) + 1); long part_size = (n - 1)/num_parts + 1; // first count buckets within each partition - auto counts = parlay::sequence::uninitialized(num_buckets * num_parts); + auto counts = parlay::sequence::uninitialized(num_buckets * num_parts); parlay::parallel_for(0, num_parts, [&] (long i) { long start = i * part_size; long end = std::min(start + part_size, n); - for (int j = 0; j < num_buckets; j++) counts[i*num_buckets + j] = 0; - for (size_t j = start; j < end; j++) counts[i*num_buckets + keys[j]]++; + for (long j = 0; j < num_buckets; j++) counts[i*num_buckets + j] = 0; + for (long j = start; j < end; j++) counts[i*num_buckets + keys[j]]++; }, 1); // transpose the counts if more than one part - parlay::sequence trans_counts; - if (num_parts > 1) { - trans_counts = parlay::sequence::uninitialized(num_buckets * num_parts); - parlay::parallel_for(0, num_buckets, [&] (long i) { - for (size_t j = 0; j < num_parts; j++) + parlay::sequence trans_counts; + if (num_parts > 1) { + trans_counts = parlay::sequence::uninitialized(num_buckets * num_parts); + parlay::parallel_for(0, num_buckets, [&] (long i) { + for (size_t j = 0; j < num_parts; j++) trans_counts[i* num_parts + j] = counts[j * num_buckets + i];}, 1); } else trans_counts = std::move(counts); @@ -39,19 +62,32 @@ counting_sort(const InIt& begin, const InIt& end, parlay::parallel_for(0, num_parts, [&] (long i) { long start = i * part_size; long end = std::min(start + part_size, n); - int local_offsets[num_buckets]; + parlay::sequence local_offsets(num_buckets); // transpose back - for (int j = 0; j < num_buckets; j++) + for (long j = 0; j < num_buckets; 
j++) local_offsets[j] = trans_counts[num_parts * j + i]; // copy to output - for (size_t j = start; j < end; j++) { - int k = local_offsets[keys[j]]++; + for (long j = start; j < end; j++) { + counter_type k = local_offsets[keys[j]]++; + // prefetching speeds up the code + #if defined(__GNUC__) || defined(__clang__) __builtin_prefetch (((char*) &out[k]) + 64); + #endif out[k] = begin[j]; }}, 1); - return parlay::tabulate(num_buckets, [&] (long i) { - return trans_counts[i * num_parts];}); + return parlay::tabulate(num_buckets+1, [&] (long i) { + return (i == num_buckets) ? (counter_type) n : trans_counts[i * num_parts];}); +} + +// A version that uses ranges as inputs and generates its own output sequence +template +auto counting_sort(const InRange& in, const KeysRange& keys, + long num_buckets) { + auto out = parlay::sequence::uninitialized(in.size()); + auto offsets = counting_sort(in.begin(), in.end(), out.begin(), keys.begin(), + num_buckets); + return std::pair(std::move(out), std::move(offsets)); } diff --git a/examples/samplesort.h b/examples/samplesort.h index eac04901..99852a08 100644 --- a/examples/samplesort.h +++ b/examples/samplesort.h @@ -9,6 +9,7 @@ #include #include "helper/heap_tree.h" +#include "counting_sort.h" // ************************************************************** // Sample sort @@ -29,7 +30,7 @@ void sample_sort_(Range in, Range out, Less less, int level=1) { long cutoff = 256; if (n <= cutoff || level > 2) { parlay::copy(in, out); - std::stable_sort(out.begin(), out.end()); + std::sort(out.begin(), out.end()); return; } @@ -58,7 +59,7 @@ void sample_sort_(Range in, Range out, Less less, int level=1) { return ss.find(in[i], less);}); // sort into the buckets - auto [keys,offsets] = parlay::internal::count_sort(in, bucket_ids, num_buckets); + auto [keys,offsets] = counting_sort(in, bucket_ids, num_buckets); // now recursively sort each bucket parlay::parallel_for(0, num_buckets, [&, &keys = keys, &offsets = offsets] (long i) {