diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 44e174e0..5813de26 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -12,7 +12,7 @@ set(EXAMPLES "primes" "BFS" "word_counts" "tokens" "filter" "linefit"
   "knuth_morris_pratt" "huffman_tree" "decision_tree_c45" "karatsuba" "suffix_tree"
   "2d_linear_program" "box_kdtree" "radix_tree" "ray_trace" "hash_map" "oct_tree"
   "3d_range" "rectangle_intersection" "star_connectivity" "ldd_connectivity" "boruvka"
-  "counting_sort")
+  "counting_sort" "integer_sort")
 
 function(add_example NAME)
   add_executable(${NAME} ${NAME}.cpp)
diff --git a/examples/counting_sort.h b/examples/counting_sort.h
index 32e7e12e..82b94df4 100644
--- a/examples/counting_sort.h
+++ b/examples/counting_sort.h
@@ -5,30 +5,53 @@
 #include <parlay/sequence.h>
 #include <parlay/primitives.h>
 
-template <typename InIt, typename OutIt, typename KeyIt>                                                    
-parlay::sequence<int>                                                                                       
-counting_sort(const InIt& begin, const InIt& end,                                                           
-              OutIt out, const KeyIt& keys,                                                                 
+// **************************************************************
+// Counting sort
+// A parallel version of counting sort.  It breaks the input into
+// partitions and for each partition, in parallel, it counts how many
+// of each key there are.  It then uses scan to calculate the offsets
+// for each bucket in each partition, and does a final pass placing
+// all keys in their correct position.
+// **************************************************************
+
+using counter_type = unsigned long;
+
+// **************************************************************
+// Input:
+//   begin and end iterators for the values to be rearranged
+//   begin iterator for the output (value_type must be the same)
+//   begin iterator for the keys (range must be same length as values)
+//   num_buckets : number of buckets (should be smallish, e.g. 256)
+// Output:
+//   Offsets within output of each key.  Will be of length num_buckets+1
+//   since the last entry will contain the total size (i.e. end-begin),
+//   except that an empty input yields just a single 0 entry.
+// **************************************************************
+template <typename InIt, typename OutIt, typename KeyIt>
+parlay::sequence<counter_type>
+counting_sort(const InIt& begin, const InIt& end,                                                    
+              OutIt out, const KeyIt& keys,
               long num_buckets) {
   long n = end - begin;
-  long num_parts = n / (num_buckets * 64) + 1;
+  if (n == 0) return parlay::sequence<counter_type>(1, 0);
+  long num_parts = std::min(1000l, n / (num_buckets * 64) + 1);
   long part_size = (n - 1)/num_parts + 1;
 
   // first count buckets within each partition
-  auto counts = parlay::sequence<int>::uninitialized(num_buckets * num_parts);
+  auto counts = parlay::sequence<counter_type>::uninitialized(num_buckets * num_parts);
   parlay::parallel_for(0, num_parts, [&] (long i) {
     long start = i * part_size;
     long end = std::min<long>(start + part_size, n);
-    for (int j = 0; j < num_buckets; j++) counts[i*num_buckets + j] = 0;
-    for (size_t j = start; j < end; j++) counts[i*num_buckets + keys[j]]++;
+    for (long j = 0; j < num_buckets; j++) counts[i*num_buckets + j] = 0;
+    for (long j = start; j < end; j++) counts[i*num_buckets + keys[j]]++;
   }, 1);
 
    // transpose the counts if more than one part                                                             
-  parlay::sequence<int> trans_counts;                                                                       
-  if (num_parts > 1) {                                                                                      
-    trans_counts = parlay::sequence<int>::uninitialized(num_buckets * num_parts);                           
-    parlay::parallel_for(0, num_buckets, [&] (long i) {                                                     
-      for (size_t j = 0; j < num_parts; j++)                                                                
+  parlay::sequence<counter_type> trans_counts;                                                                       
+  if (num_parts > 1) {
+    trans_counts = parlay::sequence<counter_type>::uninitialized(num_buckets * num_parts);
+    parlay::parallel_for(0, num_buckets, [&] (long i) {
+      for (size_t j = 0; j < num_parts; j++)
 	trans_counts[i* num_parts + j] = counts[j * num_buckets + i];}, 1);
   } else trans_counts = std::move(counts);
 
@@ -39,19 +62,32 @@ counting_sort(const InIt& begin, const InIt& end,
   parlay::parallel_for(0, num_parts, [&] (long i) {
     long start = i * part_size;
     long end = std::min<long>(start + part_size, n);
-    int local_offsets[num_buckets];
+    parlay::sequence<counter_type> local_offsets(num_buckets);
 
     // transpose back
-    for (int j = 0; j < num_buckets; j++)
+    for (long j = 0; j < num_buckets; j++)
        local_offsets[j] = trans_counts[num_parts * j + i];
 
     // copy to output
-    for (size_t j = start; j < end; j++) {
-      int k = local_offsets[keys[j]]++;
+    for (long j = start; j < end; j++) {
+      counter_type k = local_offsets[keys[j]]++;
+      // prefetching speeds up the code
+      #if defined(__GNUC__) || defined(__clang__)
       __builtin_prefetch (((char*) &out[k]) + 64);
+      #endif
       out[k] = begin[j];
     }}, 1);
 
-  return parlay::tabulate(num_buckets, [&] (long i) {                                                       
-    return trans_counts[i * num_parts];});   
+  return parlay::tabulate(num_buckets+1, [&] (long i) {
+    return (i == num_buckets) ? (counter_type) n : trans_counts[i * num_parts];});
+}
+
+// Range-based overload: takes values and keys as ranges, allocates the output itself, and returns the pair (output, offsets)
+template <typename InRange, typename KeysRange>
+auto counting_sort(const InRange& in, const KeysRange& keys,
+		   long num_buckets) {
+  auto out = parlay::sequence<typename InRange::value_type>::uninitialized(in.size());  // written through out.begin() by the call below
+  auto offsets = counting_sort(in.begin(), in.end(), out.begin(), keys.begin(),
+			       num_buckets);
+  return std::pair(std::move(out), std::move(offsets));  // (rearranged values, bucket offsets)
 }
diff --git a/examples/samplesort.h b/examples/samplesort.h
index eac04901..99852a08 100644
--- a/examples/samplesort.h
+++ b/examples/samplesort.h
@@ -9,6 +9,7 @@
 #include <parlay/utilities.h>
 
 #include "helper/heap_tree.h"
+#include "counting_sort.h"
 
 // **************************************************************
 // Sample sort
@@ -29,7 +30,7 @@ void sample_sort_(Range in, Range out, Less less, int level=1) {
   long cutoff = 256;
   if (n <= cutoff || level > 2) {
     parlay::copy(in, out);
-    std::stable_sort(out.begin(), out.end());
+    std::sort(out.begin(), out.end(), less);  // base case must order by the caller's comparator, not operator<
     return;
   }
 
@@ -58,7 +59,7 @@ void sample_sort_(Range in, Range out, Less less, int level=1) {
     return ss.find(in[i], less);});
 
   // sort into the buckets
-  auto [keys,offsets] = parlay::internal::count_sort(in, bucket_ids, num_buckets);
+  auto [keys,offsets] = counting_sort(in, bucket_ids, num_buckets);
 
   // now recursively sort each bucket
   parlay::parallel_for(0, num_buckets, [&, &keys = keys, &offsets = offsets] (long i) {