-
Notifications
You must be signed in to change notification settings - Fork 0
/
ya-sort2.c
1309 lines (1215 loc) · 52.8 KB
/
ya-sort2.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* ya-sort2.c
=========
yasort2() sorts an array of numbers and moves the contents of another array of the same size around so the "pairs" are kept together.
Eg if one array has x co-ords and the 2nd has y co-ords you can sort the x co-ords while keeping the x-y pairs together at the same array index.
It uses a quicksort with the addition of introsort functionality to give both a fast average execution time and o(n*log(n)) worse case execution time.
See "Introspective sorting and selection algorithms" by D.R.Musser,Software practice and experience, 8:983-993, 1997.
ya2sort can only sort numbers.
yasort2() takes ~ 71% longer than yasort() which is not surprising as it has to move twice the amount of data around.
It uses yasort2.h to define the type of data that will be sorted ( elem_type_sort2 ).
If PAR_SORT is defined (see below) then this code uses tasks to use all available processors to speed up sorting
On a simple test enabling this with a 2 processor system sped up sorting by 1.5*
*/
/*
Copyright (c) 2021,2022 Peter Miller
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#ifndef __BORLANDC__
#define NDEBUG /* if defined then asserts become "nothing" */
#endif
// #define inline /* builder C++ v5 does not suport inline */
// #define YA2SORT_TEST_PROGRAM /* if defined compile a test program for this code - make sure that YASORT_MEDIAN_TEST_PROGRAM is NOT defined in ya-sort.c ! */
/* For the test program, you also need to ensure the definitions of elem_type_median (in yamedian.h) and elem_type_sort2 (in yasort2.h) are the same (normally double is used for testing) */
#define PAR_SORT /* if defined create a parallel sort using threads */
// #define DEBUG /* if defined then add a few printf's so you can see whats happening, can be useful if INTROSORT_MULT needs to be tuned, but oherwise not needed */
// you should not need to edit anything below here for normal use of this software (you may need to edit yasort2.h)
// #define USE_PTHREADS /* if defined use pthreads for multitasking - otherwise if running under windows use native windows threads. Pthreads is slightly slower than native Windows threads on Windows */
#include <time.h>
#include <stdint.h>
#include <limits.h>
#include <stdbool.h>
#include <string.h>
#if defined(DEBUG) || defined (YA2SORT_TEST_PROGRAM )
#include <stdio.h>
#include <time.h>
#include "hr_timer.h"
#endif
#include <assert.h>
#ifdef PAR_SORT
#if defined _WIN32
#include <process.h> /* for _beginthreadex */
#include <windows.h> /* for number of processors */
#elif defined __GNUC__
#include <unistd.h> /* to get number of processors on OS's other than Windows */
#else
#error "Parallel sorting not supported for this complier/OS (undefine PAR_SORT to avoid this error)"
#endif
#ifdef USE_PTHREADS
#include <pthread.h>
#endif
#endif
#include "yasort2.h" /* defines elem_type_sort2 etc */
#define INTROSORT_MULT 2 /* defines when we swap to heapsort 3 means only do so very rarely, 0 means "always" use heapsort. All positive integer values (including 0) will give o(n*log(n)) worse case execution time
on the 1st step of the test program yasort2(100000001) sorting doubles the following execution speeds were obtained
with DUAL_PARTITION on yasort2 we get
INTROSORT_MULT Time(secs)
0 - heapsort already used (slow).
1 112.687 - heapsort never used, mid-range used more than at 2
2 112.907 - heapsort never used, mid-range used for some examples
3 112.925 - heapsort never used, mid range used fo 1 example
*/
#ifdef YA2SORT_TEST_PROGRAM
#ifdef DEBUG
#ifdef __BORLANDC__
#define dprintf(...) crprintf(__VA_ARGS__) /* convert to crprintf when DEBUG defined */
#else
#define dprintf(...) printf(__VA_ARGS__) /* convert to printf when DEBUG defined */
#endif
#else
#define dprintf(...) /* nothing - only prints when DEBUG defined */
#endif
#endif
#if !defined(PAR_SORT) || defined(YA2SORT_TEST_PROGRAM )
#define P_UNUSED(x) (void)x /* a way to avoid warning unused parameter messages from the compiler */
#endif
#if defined(YA2SORT_TEST_PROGRAM) || !defined(NDEBUG)
static bool check_sort( elem_type_sort2 *a, size_t n); // check result of sort is ordered correctly
#endif
static void heapsort2(elem_type_sort2 *a,elem_type_sort2 *b,size_t n); // backup sort automatically used when required.
#if defined __GNUC__
static inline int ilog2(size_t x) { return 63 - __builtin_clzll(x); }
#elif defined _WIN32 && !defined __BORLANDC__
#include <intrin.h>
static inline int ilog2(size_t x)
{
unsigned long i = 0;
_BitScanReverse(&i, x);
return i;
}
#elif defined _WIN64 && !defined __BORLANDC__
#include <intrin.h>
static inline int ilog2(size_t x)
{
unsigned long i = 0;
_BitScanReverse64(&i, x);
return i;
}
#else // version in standard C , this is slower than above optimised versions but portable
/* algorithm from https://stackoverflow.com/questions/11376288/fast-computing-of-log2-for-64-bit-integers */
#if (defined __BORLANDC__ && defined _WIN64) || ( defined __SIZEOF_POINTER__ && __SIZEOF_POINTER__ == 8)
static inline int ilog2(size_t x) // unsigned 64 bit version
{
#define S(k) if (x >= (UINT64_C(1) << k)) { i += k; x >>= k; }
int i = 0;
S(32);
S(16);
S(8);
S(4);
S(2);
S(1);
return i;
#undef S
}
#elif (defined __BORLANDC__ && defined __WIN32__) || ( defined __SIZEOF_POINTER__ && __SIZEOF_POINTER__ == 4 )
static inline int ilog2(size_t x) // unsigned 32 bit version
{
#define S(k) if (x >= (UINT32_C(1) << k)) { i += k; x >>= k; }
int i = 0;
S(16);
S(8);
S(4);
S(2);
S(1);
return i;
#undef S
}
#else
#error "unknown pointer size - expected 4 (32 bit) or 8 (64 bit) "
#endif
#endif
#define cswap(i,j) {elem_type_sort2 _t;_t=(i);i=(j);j=_t;} // example call cswap(r[i], r[minIndex]); WARNING - side effects could be an issue here ! eg cswap(r[i++],r[b])
/* note that an intelligent swap (that only writes back if the value is different) is slightly slower in the test program */
#ifdef YA2SORT_TEST_PROGRAM
inline static void eswap1(size_t i,size_t j,elem_type_sort2 *a) /* swap a[i] and a[j] */
{
cswap(a[i],a[j]); // use swap macro
}
#endif
inline static void eswap2(size_t i,size_t j,elem_type_sort2 *a,elem_type_sort2 *b) /* swap a[i] and a[j] also swap b[i] and b[j] */
{
cswap(a[i],a[j]); // use swap macro on array a
cswap(b[i],b[j]); // use swap macro on array b
}
inline static void eswap2p(elem_type_sort2 *pi,elem_type_sort2 *pj,elem_type_sort2 *a,elem_type_sort2 *b) /* swap pi and pj also swap b[i] and b[j] */
{
cswap(*pi,*pj); // use swap macro on array a
cswap(*(pi-a+b),*(pj-a+b)); // use swap macro on array b
}
#define elem_type_ss elem_type_sort2 /* set type for smallsort correctly */
#define _yaSORT2 /* tell smallsort we have 2 arrays */
#include "ya-smallsort.h" // contains small_sort() - this needs to be included after cswap is defined
#ifdef YA2SORT_TEST_PROGRAM
/* replacement for library rand() function that has known (good) properties
The default generator in tdm-gcc 10.3.0 has RAND_MAX for 32767 which is not ideal.
Public domain code for JKISS RNG from http://www0.cs.ucl.ac.uk/staff/d.jones/GoodPracticeRNG.pdf
claims any of the 2 generators combined still pass all the Dieharder tests (and obviously all 3 combined do).
It also passes the complete BigCrunch test set in testU01.
Its period is ~ 2^127 (1.7e38).
*/
static uint32_t x=123456789,y=987654321,z=43219876,c=6543217;/* Seed variables */
static uint32_t JKISS(void)
{
uint64_t t;
x=314527869*x+1234567;
y^=y<<5;
y^=y>>7;
y^=y<<22;
t =4294584393UL*z+c;
c= t>>32;
z= t;
return x+y+z;
}
static int rand(void)
{return JKISS() & INT_MAX; // convert unsigned to signed
}
static void srand(unsigned int seed)
{ // seed random number generator, 0 gives the same sequence as you get without calling srand()
x=123456789+seed;
y=987654321+seed;
z=43219876+seed;
c=6543217+seed;
}
#ifdef RAND_MAX
#undef RAND_MAX
#endif
#define RAND_MAX INT_MAX /* in case its used */
#ifdef RAND_MAX
#undef RAND_MAX
#endif
#define RAND_MAX INT_MAX /* in case its used */
static int bi_modal_rand(void) /* returns a random number with a "hole" in the distribution so the mean & (max-min)/2 are significantly different to the median */
{int r=rand();
while(r>RAND_MAX/4 && r<RAND_MAX/2) r=rand(); // skip values that fall in the "hole" from RAND_MAX/4 to RAND_MAX/2. "hole" is off centre to make distribution non-symetrical.
return r;
}
#endif
#ifndef min /* with gcc this is already defined */
#define min(a, b) (a) < (b) ? (a) : (b) /* warning a,b evaluated more than once ! */
#endif
#if defined(YA2SORT_TEST_PROGRAM) || !defined(NDEBUG)
static bool check_sort( elem_type_sort2 *a, size_t n) // check result of sort is ordered correctly
{// returns true if sort is ok
/* check array actually is sorted correctly in increasing order */
size_t errs=0;
if(n<2) return true;
{size_t i;
for(i=0;i<n-1;++i)
if(a[i+1]<a[i])
errs++;
}
return (errs==0);
}
#endif
/*
* From:
* Fast median search: an ANSI C implementation
* Nicolas Devillard - ndevilla AT free DOT fr
* July 1998
*
* The following routines have been built from knowledge gathered
* around the Web. I am not aware of any copyright problem with
* them, so use it as you want.
* N. Devillard - 1998
* The functions below have been edited by Peter Miller 9/2021 to fit in with the rest of the code here.
*/
#define Zv(a,b) { if ((a)>(b)) cswap((a),(b)); }
/*----------------------------------------------------------------------------
Function : opt_med9()
In : pointer to an array of 9 elem_type_sort2 values
Out : an elem_type_sort2
Job : optimized search of the median of 9 elem_type_sort2 values
Notice : in theory, cannot go faster without assumptions on the signal.
Formula from:
XILINX XCELL magazine, vol. 23 by John L. Smith
The input array is modified in the process
The result array is guaranteed to contain the median
value in middle position, but other elements are NOT fully sorted sorted.
---------------------------------------------------------------------------*/
static elem_type_sort2 opt_med9(elem_type_sort2 * p)
{
Zv(p[1], p[2]) ; Zv(p[4], p[5]) ; Zv(p[7], p[8]) ;
Zv(p[0], p[1]) ; Zv(p[3], p[4]) ; Zv(p[6], p[7]) ;
Zv(p[1], p[2]) ; Zv(p[4], p[5]) ; Zv(p[7], p[8]) ;
Zv(p[0], p[3]) ; Zv(p[5], p[8]) ; Zv(p[4], p[7]) ;
Zv(p[3], p[6]) ; Zv(p[1], p[4]) ; Zv(p[2], p[5]) ;
Zv(p[4], p[7]) ; Zv(p[4], p[2]) ; Zv(p[6], p[4]) ;
Zv(p[4], p[2]) ; return(p[4]) ;
}
/*----------------------------------------------------------------------------
Function : opt_med25()
In : pointer to an array of 25 elem_type_sort2 values
Out : an elem_type_sort2
Job : optimized search of the median of 25 elem_type_sort2 values
The input array is modified in the process
The result array is guaranteed to contain the median
value in middle position, but other elements are NOT fully sorted sorted.
Notice : in theory, cannot go faster without assumptions on the signal.
Code taken from Graphic Gems.
---------------------------------------------------------------------------*/
static elem_type_sort2 opt_med25(elem_type_sort2 * p)
{
Zv(p[0], p[1]) ; Zv(p[3], p[4]) ; Zv(p[2], p[4]) ;
Zv(p[2], p[3]) ; Zv(p[6], p[7]) ; Zv(p[5], p[7]) ;
Zv(p[5], p[6]) ; Zv(p[9], p[10]) ; Zv(p[8], p[10]) ;
Zv(p[8], p[9]) ; Zv(p[12], p[13]) ; Zv(p[11], p[13]) ;
Zv(p[11], p[12]) ; Zv(p[15], p[16]) ; Zv(p[14], p[16]) ;
Zv(p[14], p[15]) ; Zv(p[18], p[19]) ; Zv(p[17], p[19]) ;
Zv(p[17], p[18]) ; Zv(p[21], p[22]) ; Zv(p[20], p[22]) ;
Zv(p[20], p[21]) ; Zv(p[23], p[24]) ; Zv(p[2], p[5]) ;
Zv(p[3], p[6]) ; Zv(p[0], p[6]) ; Zv(p[0], p[3]) ;
Zv(p[4], p[7]) ; Zv(p[1], p[7]) ; Zv(p[1], p[4]) ;
Zv(p[11], p[14]) ; Zv(p[8], p[14]) ; Zv(p[8], p[11]) ;
Zv(p[12], p[15]) ; Zv(p[9], p[15]) ; Zv(p[9], p[12]) ;
Zv(p[13], p[16]) ; Zv(p[10], p[16]) ; Zv(p[10], p[13]) ;
Zv(p[20], p[23]) ; Zv(p[17], p[23]) ; Zv(p[17], p[20]) ;
Zv(p[21], p[24]) ; Zv(p[18], p[24]) ; Zv(p[18], p[21]) ;
Zv(p[19], p[22]) ; Zv(p[8], p[17]) ; Zv(p[9], p[18]) ;
Zv(p[0], p[18]) ; Zv(p[0], p[9]) ; Zv(p[10], p[19]) ;
Zv(p[1], p[19]) ; Zv(p[1], p[10]) ; Zv(p[11], p[20]) ;
Zv(p[2], p[20]) ; Zv(p[2], p[11]) ; Zv(p[12], p[21]) ;
Zv(p[3], p[21]) ; Zv(p[3], p[12]) ; Zv(p[13], p[22]) ;
Zv(p[4], p[22]) ; Zv(p[4], p[13]) ; Zv(p[14], p[23]) ;
Zv(p[5], p[23]) ; Zv(p[5], p[14]) ; Zv(p[15], p[24]) ;
Zv(p[6], p[24]) ; Zv(p[6], p[15]) ; Zv(p[7], p[16]) ;
Zv(p[7], p[19]) ; Zv(p[13], p[21]) ; Zv(p[15], p[23]) ;
Zv(p[7], p[13]) ; Zv(p[7], p[15]) ; Zv(p[1], p[9]) ;
Zv(p[3], p[11]) ; Zv(p[5], p[17]) ; Zv(p[11], p[17]) ;
Zv(p[9], p[17]) ; Zv(p[4], p[10]) ; Zv(p[6], p[12]) ;
Zv(p[7], p[14]) ; Zv(p[4], p[6]) ; Zv(p[4], p[7]) ;
Zv(p[12], p[14]) ; Zv(p[10], p[14]) ; Zv(p[6], p[7]) ;
Zv(p[10], p[12]) ; Zv(p[6], p[10]) ; Zv(p[6], p[17]) ;
Zv(p[12], p[17]) ; Zv(p[7], p[17]) ; Zv(p[7], p[10]) ;
Zv(p[12], p[18]) ; Zv(p[7], p[12]) ; Zv(p[10], p[18]) ;
Zv(p[12], p[20]) ; Zv(p[10], p[20]) ; Zv(p[10], p[12]) ;
return (p[12]);
}
/* end of N. Devillard functions */
/*
heapsort() is used within yasort2 as part of its introsort functionality (ie when it struggles to obtain O(n*log(n)) execution time).
Heapsort has a guaranteed O(n*log(n)) execution time.
It is however a LOT slower than yasort2 on the test suite (~ 6* based on the total test runtime)
Note that in practice heapsort is normally used very little so its slower speed is not an issue.
The advantage of using a heapsort here is that it makes the sorting code completely independent of the median code,
ya_msort (in ya-select.c) could be used in its place and it a lot faster - but this makes no difference to the overall execution time of yasort2().
*/
// heapsort based on code at http://www.codecodex.com/wiki/Heapsort#C.2FC.2B.2B - which itself is based on the numerical recipees algorithm
// This version supports elem_type_sort2 (ie will sort any type that can be compared ).
// This sorts array a[] keeping values in b[] in the same relative place
static void heapsort2 (elem_type_sort2 *a,elem_type_sort2 *b, size_t n)
{
size_t i = n/2, parent, child;
elem_type_sort2 t,t2;
for (;;)
{ /* Loops until a[] is sorted */
if (i > 0)
{ /* First stage - creating the heap , highest value will end up in a[0] */
i--;
t = a[i]; /* Save parent value to t */
t2= b[i];
}
else
{ /* Second stage (when i==0) - Extracting elements in-place */
n--; /* Make the new heap smaller */
if (n == 0) return; /* When the heap is empty, we are done */
t = a[n]; /* Save last value (it will be overwritten on the line below) */
a[n] = a[0]; /* Save largest value at the end of arr */
t2= b[n];
b[n] = b[0];
}
parent = i; /* We will start pushing down t from parent */
child = parent*2 + 1; /* parent's left child */
/* Sift operation - pushing the value of t down the heap */
while (child < n)
{ // Choose the largest child - assume left child and adjust if its the right one
if (child + 1 < n && a[child + 1] > a[child])
{
child++; /* right child is the largest */
}
if (a[child] > t)
{ /* If any child is bigger than the parent */
a[parent] = a[child]; /* Move the largest child up */
b[parent] = b[child];
parent = child; /* Move parent pointer to this child */
child = parent*2+1;
}
else
{
break; /* t's place is found */
}
}
a[parent] = t; /* We save t in the heap */
b[parent] = t2;
}
}
// This is a version of quicksort that only partitions into 2 partitions (<= pivot, >= pivot) so inner loop has a simpler structure than one that splits into 3 partitions
// The inner quicksort loop does not degrade into a O(n^2) when there are duplicate values in the array to be sorted (many partitioning schemes do).
// Again it uses introsort techniques to achieve O(n*log2(n)) worse case runtime.
// This version was created by Peter Miller 9/1/2022
static void _yasort2(elem_type_sort2 *x,elem_type_sort2 *y, size_t n,unsigned int nos_p); /* main worker function */
#ifdef PAR_SORT /* helper code for Parallel version */
#define PAR_DIV_N 16 /* divisor on n (current partition size) to check size of partition about to be spawned as a new task is big enough to justify the work of creating a new task */
#define PAR_MIN_N 10000 /* min size of a partition to be spawned as a new task */
struct _params
{elem_type_sort2 *xp;
elem_type_sort2 *yp;
size_t np;
unsigned int nos_p_p;
volatile int task_fin; /* 0 => task running, 1=> task finished. Volatile as set by seperate task. This variable makes porting to pthreads simpler*/
};
#ifdef USE_PTHREADS // void *(*start_routine)(void *)
static void * yasortThreadFunc( void * _Arg ) // parallel thread that can sort a partition
#else
static unsigned __stdcall yasortThreadFunc( void * _Arg ) // parallel thread that can sort a partition
#endif
{struct _params* Arg=_Arg;
Arg->task_fin=0; // This should be set before starting the thread, waiting till now leaves a short time when the thread may appear to have finished but its actually not started... Its done here "just in case".
_yasort2(Arg->xp,Arg->yp,Arg->np,Arg->nos_p_p); // sort required section
Arg->task_fin=1; // indicate task now finished - note it will actually finish when this function returns so this flag just indicates another task can now "wait" on this task finishing
// _endthreadex( 0 ); - _endthreadex is call automatically when we return from this function - see https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/endthread-endthreadex?view=msvc-170
return 0;
}
#ifndef _WIN32
/* this code is for non-Windows systems , beleived to work under Linux, BSD unix and MAC OS */
static int nos_procs(void) /* return the number of logical processors present and enabled */
{ return sysconf(_SC_NPROCESSORS_ONLN);
}
#ifdef YASORT_MEDIAN_TEST_PROGRAM /* only needed for the test program */
static void proc_info(void)
{
printf(" %d processors available\n",nos_procs());
printf(" system - number of cpus is %d\n", sysconf( _SC_NPROCESSORS_CONF));
printf(" system - enable number of cpus is %d\n", sysconf(_SC_NPROCESSORS_ONLN));
}
#endif
#else
/* we ideally need to know the number of processors - the code below gets this for windows */
/* this is from https://docs.microsoft.com/de-de/windows/win32/api/sysinfoapi/nf-sysinfoapi-getlogicalprocessorinformation with minor changes by Peter Miller */
typedef BOOL (WINAPI *LPFN_GLPI)(
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION,
PDWORD);
// Helper function to count set bits in the processor mask.
static DWORD CountSetBits(ULONG_PTR bitMask)
{
DWORD LSHIFT = sizeof(ULONG_PTR)*8 - 1;
DWORD bitSetCount = 0;
ULONG_PTR bitTest = (ULONG_PTR)1 << LSHIFT;
DWORD i;
for (i = 0; i <= LSHIFT; ++i)
{
bitSetCount += ((bitMask & bitTest)?1:0);
bitTest/=2;
}
return bitSetCount;
}
#ifdef YA2SORT_TEST_PROGRAM /* only needed for the test program */
static void proc_info(void)
{
BOOL done = FALSE;
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL;
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = NULL;
DWORD returnLength = 0;
DWORD logicalProcessorCount = 0;
DWORD numaNodeCount = 0;
DWORD processorCoreCount = 0;
DWORD processorL1CacheCount = 0;
DWORD processorL2CacheCount = 0;
DWORD processorL3CacheCount = 0;
DWORD processorPackageCount = 0;
DWORD byteOffset = 0;
PCACHE_DESCRIPTOR Cache;
while (!done) /* we need to call GetLogicalProcessorInformation() twice, the 1st time it tells us how big a buffer we need to supply */
{
DWORD rc = GetLogicalProcessorInformation(buffer, &returnLength);
if (FALSE == rc)
{
if (GetLastError() == ERROR_INSUFFICIENT_BUFFER)
{
if (buffer)
free(buffer);
buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(
returnLength);
if (NULL == buffer)
{
printf("\nError: Allocation failure\n");
return ;
}
}
else
{
printf("\nError %d\n", (int)GetLastError());
return ;
}
}
else
{
done = TRUE;
}
}
ptr = buffer;
while (byteOffset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= returnLength)
{
switch (ptr->Relationship)
{
case RelationNumaNode:
// Non-NUMA systems report a single record of this type.
numaNodeCount++;
break;
case RelationProcessorCore:
processorCoreCount++;
// A hyperthreaded core supplies more than one logical processor.
logicalProcessorCount += CountSetBits(ptr->ProcessorMask);
break;
case RelationCache:
// Cache data is in ptr->Cache, one CACHE_DESCRIPTOR structure for each cache.
Cache = &ptr->Cache;
if (Cache->Level == 1)
{
processorL1CacheCount++;
}
else if (Cache->Level == 2)
{
processorL2CacheCount++;
}
else if (Cache->Level == 3)
{
processorL3CacheCount++;
}
break;
case RelationProcessorPackage:
// Logical processors share a physical package.
processorPackageCount++;
break;
default:
printf("\nError: Unsupported LOGICAL_PROCESSOR_RELATIONSHIP value.\n");
break;
}
byteOffset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
ptr++;
}
printf("\n GetLogicalProcessorInformation results:\n");
printf(" Number of NUMA nodes: %d\n",
(int)numaNodeCount);
printf(" Number of physical processor packages: %d\n",
(int)processorPackageCount);
printf(" Number of processor cores: %d\n",
(int)processorCoreCount);
printf(" Number of logical processors: %d\n",
(int)logicalProcessorCount);
printf(" Number of processor L1/L2/L3 caches: %d/%d/%d\n",
(int)processorL1CacheCount,
(int)processorL2CacheCount,
(int)processorL3CacheCount);
free(buffer);
return;
}
#endif
static unsigned int nos_procs(void) /* return the number of logical processors present.
This was created by Peter Miller 15/1/2022 based on above code */
{
BOOL done = FALSE;
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL;
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = NULL;
DWORD returnLength = 0;
unsigned int logicalProcessorCount = 0;
unsigned int processorCoreCount = 0;
DWORD byteOffset = 0;
while (!done) /* we need to call GetLogicalProcessorInformation() twice, the 1st time it tells us how big a buffer we need to supply */
{
BOOL rc = GetLogicalProcessorInformation(buffer, &returnLength);
if (rc == FALSE )
{
if (GetLastError() == ERROR_INSUFFICIENT_BUFFER)
{
if (buffer)
free(buffer);
buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(
returnLength);
if (NULL == buffer)
{
// Error: memoery Allocation failure
return 0;
}
}
else
{ // other (unexpected) error
return 0;
}
}
else
{
done = TRUE;
}
}
ptr = buffer;
while (byteOffset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= returnLength)
{
if (ptr->Relationship == RelationProcessorCore)
{
processorCoreCount++;
// A hyperthreaded core supplies more than one logical processor.
logicalProcessorCount += CountSetBits(ptr->ProcessorMask);
}
byteOffset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
ptr++;
}
free(buffer);
return logicalProcessorCount; // actual number of processor cores is processorCoreCount
}
#endif
#endif
static void _yasort2(elem_type_sort2 *x,elem_type_sort2 *y, size_t n,unsigned int nos_p ) /* main worker function, nos_p is nos processors available */
{
bool need_max=true,need_min=true;
elem_type_sort2 min, max;
int itn=0;
if(n<=1) return; // avoid trying to take log2 of 0 - anyway an array of length 1 is already sorted
const int max_itn=INTROSORT_MULT*ilog2(n); // max_itn defines point we swap to mid-range pivot, then at 2*max_int we swap to heapsort. if INTROSORT_MULT=0 then "always" use heapsort, 3 means "almost never" use heapsort
min = max = x[0] ;
#ifdef PAR_SORT
struct _params params;
#ifdef USE_PTHREADS
pthread_t *th=NULL;// set to &thread_id when running
pthread_t thread_id;
#else
HANDLE th=NULL; // handle for worker thread (Windows threads)
#endif
#else
P_UNUSED(nos_p); // this param is not used unless PAR_SORT is defined
#endif
while(1) // replace tail recursion with a loop
{
if (n <= 1) goto sortend; // need a common end point as may be using threads in which case we need to wait for them to complete
/* Use a simple (fast) sort for small n [ this is normally an optimal sort (so use n<=32) ] Using small_sort2() is ~ 6% faster than using the insertion sort below for n<=32 */
if(n<=32)
{
small_sort2( x,y, n);
assert(check_sort( x, n) ); // check result of sort is ordered correctly
goto sortend; // need a common end point as may be using threads in which case we need to wait for them to complete
}
// next try an insertion sort, abort this if its taking too much effort
// this efficiently sorts arrays that are almost perfectly sorted already.
// using this gives an average 33% speedup using the test program when MAX_INS_MOVES=2
// There is also a potentially useful side effect of this, if we have to move to using quicksort then 2 values will have been moved which could help break up bad patterns of data.
#define MAX_INS_MOVES 2 /* max allowed number of moves - for the test program 2 is the optimum value */
size_t nos_ins_moves=0;
for (elem_type_sort2 *p=x,*p2=y; p<x+n-1; ++p,++p2)
{
elem_type_sort2 t = p[1],t2=p2[1];
elem_type_sort2 *j = p,*j2=p2;
if(*j>t)
{// out of order
if(++nos_ins_moves>MAX_INS_MOVES) goto do_qsort; // too many moves - swap to qsort
do
{j--;j2--;
} while(j>=x && *j>t);
memmove(j+2,j+1,(size_t)(p-j)*sizeof(elem_type_sort2));// move a portion of array x right by 1 to make space for t
j[1]=t;
memmove(j2+2,j2+1,(size_t)(p2-j2)*sizeof(elem_type_sort2));// move a portion of array y right by 1 to make space for t2
j2[1]=t2;
}
}
assert(check_sort( x, n) ); // check result of sort is ordered correctly
goto sortend; // need a common end point as may be using threads in which case we need to wait for them to complete
do_qsort: ; // ; as label can only be attached to a statement and the line below is a declaration ...
// if we have made too many iterations of this while loop then we need to swap to heapsort
if(++itn>2*max_itn)
{
#ifdef DEBUG
printf("yasort2: using heapsort(%.0f)\n",(double)n);
#endif
heapsort2(x,y,n);
assert(check_sort( x, n) ); // check result of sort is ordered correctly
goto sortend; // need a common end point as may be using threads in which case we need to wait for them to complete
}
/* start of quicksort */
/* The partitioning algorithm used here partitions into 2 groups, <= pivot and >= pivot
The code can also create a partition = pivot in some cases but this partition does not include all values = pivot
*/
/* select pivot into v - if a large number of elements use median of 25 elements otherwise use median of 9 elements.
if we have made > max_itn loops then we use the mid-range for the pivot.
If we get here we know array has >32 elements in it
*/
elem_type_sort2 v; // pivot value
if(itn>max_itn)
{ /* using medians to select the pivot has failed, swap to using the mid-range ( (max-min)/2 ) as the pivot value.
This is the pivot value used by the Torben algorithm - see ya-median.c for more details.
Because this pivot selection method looks at all the values in x[] it always makes some progress and is not sensitive to the "pattern" of the data in x[]
You could also consider this as a radix exchange sort (especially if sorting integers) - see "The art of concurrency, A thread monkey's guide to writing parallel applications", Clay Breshears, 2009, page 182-3
A radix exchange sort has a linear time [ O(n) ] asymptotic complexity.
*/
#ifdef DEBUG
printf("yasort2: using mid-range as pivot (%.0f)\n",(double)n);
#endif
if(need_max && need_min)
{
min = max = x[0] ;
for (elem_type_sort2 *p=x+1; p<x+n; p++)
{// find min & max
elem_type_sort2 t=*p;
if (t<min) min=t;
if (t>max) max=t;
}
}
else if(need_max)
{max = x[0] ;
for (elem_type_sort2 *p=x+1; p<x+n; p++)
{// find max
elem_type_sort2 t=*p;
if (t>max) max=t;
}
}
else if(need_min)/* need_min */
{min = x[0] ;
for (elem_type_sort2 *p=x+1; p<x+n; p++)
{// find min
elem_type_sort2 t=*p;
if (t<min) min=t;
}
}
if(max==min)
{
// only 1 value - so nothing left to sort. This should never happen as the insertion sort would have trapped this and already returned...
goto sortend; // need a common end point as may be using threads in which case we need to wait for them to complete
}
else
{
v= min/2+max/2; // reasonable estimate of median, so use as pivot.
}
}
else if(n>10000 )
{// large number of elements then take median of 25 values
// use 1st 5 middle 5 and last 5 and two sets of 5 inbetween to try and be "cache friendly"
elem_type_sort2 a[25];
const size_t b=(n-1)/2; // middle item
size_t c=b/2;// 1/4 point
// copy values into array a in blocks of 5
memcpy(a,x,5*sizeof(elem_type_sort2)); // to,from,size :start: x[0],x[1],x[2],x[3],x[4]
memcpy(a+5,x+c-2,5*sizeof(elem_type_sort2));// 1/4 : x[c-2],x[c-1],x[c],x[c+1],x[c+2]
memcpy(a+10,x+b-2,5*sizeof(elem_type_sort2)); // middle: x[b-2],x[b-1],x[b],x[b+1],x[b+2]
c+=b;// 3/4 point
memcpy(a+15,x+c-2,5*sizeof(elem_type_sort2)); // 3/4: x[c-2],x[c-1],x[c],x[c+1],x[c+2]
memcpy(a+20,x+n-5,5*sizeof(elem_type_sort2)); // end: x[n-5],x[n-4],x[n-3],x[n-2],x[n-1]
v=opt_med25(a);
}
else
/* Median of 9 items selected uniformly across dataset */
{
elem_type_sort2 a[9];
const size_t b=(n-1)/2; // middle item
const size_t c=(n-1)/9;
a[0]=x[0];
a[1]=x[2*c];
a[2]=x[3*c];
a[3]=x[4*c];
a[4]=x[b];
a[5]=x[6*c];
a[6]=x[7*c];
a[7]=x[8*c];
a[8]=x[n-1];
v=opt_med9(a);
}
/* partitioning scheme with pivot as a value (v) */
// v is pivot (for this implementation this does not have to be a value thats actually in the array, but it should be in the range of values in the array !
/* this version uses pointers rather than array indices - this might be simpler for a compiler to optimise ? and it also avoids issues with unsigned values going to "-1" */
/* with TDM-gcc 10.3.0 this is about 2% faster on the test program than a version using array indices */
elem_type_sort2 *pi,*pj;// 2 pointers to replace indices i,j
pi=x-1;// one before start as loop below starts with pi++
pj=x+n; // one after end as loop below starts pj--
for (;;)
{
do pi++; while (pj >= pi && *pi < v); // was i < n
do pj--; while (pj >= pi && *pj > v);
if (pj < pi) break;
//eswap2(pi-x,pj-x, x, y); // still uses indices
eswap2p(pi,pj, x, y);
}
pj=pi;// now look for multiple values equal to v adjacent to pivot position (pi)
if(*pi==v)
{
while(pi<x+n-1 && pi[1]==v) ++pi;
while(pj>x && pj[-1]==v) --pj;
}
#ifdef PAR_SORT
/* if using parallel tasks check here to see if task spawned from this function has finished, if so we can spawn another one to keep it busy
We allow spawned tasks and recursive calls to spawn more tasks if we have enough processors (thats the (nos_p)/2 passed as a paramater)
This approach should scale reasonably well without needing any complex interactions betweens tasks (as they are all working on seperate portions of the arrays x & y)
With 1 processor everything is in main function.
With 2 processors we use both
With 4 processors we use 3 [ as (4)/2=2, then 2/2=1 , so we use main processor and spawn 2 threads]
With 8 (logical) processors we use 7 [ (8)/2=4, (4)/2=2, then 2/2= 1 so we use main processor + 2 threads + 4 threads = 7 in total ]
In terms of run time - using a processor with 8 logical cores (4 physical cores) available:
1 core (PAR_SORT not defined) - test program sorting largest size took 52.662 secs
4 cores 21.767 secs = 2.4* speedup
8 cores 19.350 secs = 2.7* speedup (there were only 4 physical cores which may partly explain the reduced improvement)
*/
if(th!=NULL && n>2*PAR_MIN_N )
{// if thread active and partition big enough that we might be able to use a parallel task (2* as we will at least halve the size of the partition for the parallel task)
#ifdef USE_PTHREADS
if( params.task_fin==1 )
{// if thread has finished
pthread_join(thread_id,NULL);
th=NULL; // set to NULL so we can reuse it
// dprintf("Thread finished within function processing size=%llu nos_p=%d\n",n,nos_p);
}
#else
if( params.task_fin==1 && WaitForSingleObject( th, 0 )!=WAIT_TIMEOUT)
{// if thread has finished - using params.task_fin==1 might be more efficient than always calling WaitForSingleObject() - but anyway it makes porting to pthreads simpler
CloseHandle( th );// Destroy the thread object.
th=NULL; // set to NULL so we can reuse it
// dprintf("Thread finished within function processing size=%llu nos_p=%d\n",n,nos_p);
}
#endif
}
#endif
if((size_t)(pj-x)<n-(size_t)(pi-x)) // j<n-i
{
#ifdef PAR_SORT
if( th==NULL && nos_p>1 && (size_t)(pj-x) > n/PAR_DIV_N && (size_t)(pj-x) > PAR_MIN_N)
{// use a worker thread last 2 tests check the overhead of thread creation is worth it.
params.xp=x;
params.yp=y;
params.np=(size_t)(pj-x);
params.nos_p_p=(nos_p)/2; // if we still have spare processors allow more threads to be started
params.task_fin=0; // task has not yet finished
#ifdef USE_PTHREADS
if(pthread_create(&thread_id,NULL,yasortThreadFunc,¶ms)==0)
{th=&thread_id; // success
}
else
{th=NULL; // failed to run task
}
#else /* use native Windows threads */
th=(HANDLE)(uintptr_t)_beginthreadex(NULL,0,yasortThreadFunc,¶ms,0,NULL);
#endif
if(th==NULL) _yasort2(x,y, (size_t)(pj-x),0); // if starting thread fails then do in this process. nos_p=0 so don't run any tasks from here
}
else
{_yasort2(x,y, (size_t)(pj-x),(nos_p)/2); // recurse for smalest partition so stack depth is bounded at O(log2(n)). Allow more threads from subroutine if we still have some processors spare
assert(check_sort(x,(size_t)(pj-x)));// check sort worked correctly
}
#else
_yasort2(x,y, pj-x,0); // recurse for smalest partition so stack depth is bounded at O(log2(n))
assert(check_sort(x,pj-x));// check sort worked correctly
#endif
y+=pi-x;
n-=(size_t)(pi-x);
x=pi;
}
else
{
#ifdef PAR_SORT
if( th==NULL && nos_p>1 && (n-(size_t)(pi-x)) > n/PAR_DIV_N && (n-(size_t)(pi-x)) > PAR_MIN_N)
{// use a worker thread
params.xp=pi;
params.yp=y+(pi-x);
params.np=n-(size_t)(pi-x);
params.nos_p_p=(nos_p)/2;
params.task_fin=0; // task has not yet finished
#ifdef USE_PTHREADS
if(pthread_create(&thread_id,NULL,yasortThreadFunc,¶ms)==0)
{th=&thread_id; // success
}
else
{th=NULL; // failed to run task
}
#else /* use native Windows threads */
th=(HANDLE)(uintptr_t)_beginthreadex(NULL,0,yasortThreadFunc,¶ms,0,NULL);
#endif
if(th==NULL) _yasort2(pi,y+(pi-x), n-(size_t)(pi-x),0); // if starting thread fails then do in this process. nos_p=0 so don't run any tasks from here
}
else
{ _yasort2(pi,y+(pi-x), n-(size_t)(pi-x),(nos_p)/2); // recurse for smalest partition so stack depth is bounded at O(log2(n))
assert(check_sort(pi,(size_t)(n-(size_t)(pi-x)))); // check sort worked correctly
}
#else
_yasort2(pi,y+(pi-x), n-(size_t)(pi-x),0);
assert(check_sort(pi,n-(pi-x))); // check sort worked correctly
#endif
n=(size_t)(pj-x);
}
} // end while(1)
sortend: ; // need a common end point as may be using threads in which case we need to wait for them to complete
#ifdef PAR_SORT
if(th!=NULL)
{// if a thread used and its still running, need to wait for it to finish
#ifdef USE_PTHREADS
pthread_join(thread_id,NULL);
#else /* using native Windows threads */
WaitForSingleObject( th, INFINITE );
// Destroy the thread object.
CloseHandle( th );
#endif
}
#endif
return;
}
void yasort2(elem_type_sort2 *x,elem_type_sort2 *y, size_t n) /* user function */
{
#ifdef PAR_SORT
unsigned int nos_p=nos_procs() ;// total number of (logical) processors available
_yasort2(x,y, n,nos_p); /* call main worker function */
#else
_yasort2(x,y, n,1); /* call main worker function , with only 1 processor (as NOS_PAR not defined ) */
#endif
}
#ifdef YA2SORT_TEST_PROGRAM /* test program required, times execution - while doing a set of benchmark tests for functionality */
#include "hr_timer.h"
#include <math.h>
#include "yamedian.h" // we also test median code, so need header for that as well
static size_t dataLen=100000001; /* should normally be 100000001 (needs this for median of 3 killer pattern) and elem_type_sort2 should be double for most tests */
#define NOS_TESTS 3 /* number of runs of median() to time [normally 3 - we use median of this many runs ]*/
#define TIMES_IGNORE 0 /* ignore this many longest runtimes [ normally 0 - median time which rejects outliers so this has little use now ] */
/* use the #if chain below to select what you want to check/benchmark. */
#if 1 /* test ya2sort */
// void qsort2(elem_type_sort2 *a,elem_type_sort2 *b, int n)
#define MEDIAN_NAME "yasort2-2 partition " // name of algorithm we are testing
#define median(a,b,s) (yasort2(a,b,s),a[(s-1)/2]) // function call to test
#define IS_SORT /* method sorts data */
#else /* test heapsort2 */
#define MEDIAN_NAME "heapsort2" // name of algorithm we are testing
#define median(a,b,s) (heapsort2(a,b,s),a[(s-1)/2]) // function call to test
#define IS_SORT /* method sorts data */
#endif
/* insertion sort for small arrays of doubles (used in test program below) */
static inline void dbl_ins_sort( double *x, const size_t n)
{
if(n<2) return;// if length 1 then already sorted
for (double *p=x; p<x+n-1; ++p)
{
double t = p[1];
double *j = p;
if(*j>t)
{// out of order
do
{j--;
} while(j>=x && *j>t);
memmove(j+2,j+1,(p-j)*sizeof(double));// move a portion of array x right by 1 to make space for t