diff --git a/About.cpp b/About.cpp
index dd7b8c3..1b09ab6 100644
--- a/About.cpp
+++ b/About.cpp
@@ -1,5 +1,5 @@
 /*----------------------------------------------------------------------------
- * Copyright (c) 2019 Peter Miller
+ * Copyright (c) 2019,2022 Peter Miller
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
diff --git a/UDataPlotWindow.cpp b/UDataPlotWindow.cpp
index d468db0..56800aa 100644
--- a/UDataPlotWindow.cpp
+++ b/UDataPlotWindow.cpp
@@ -48,10 +48,13 @@
 //               : all filters now report progress as a % (previoulsy min abs error and min rel error did not report progress and they could be quite slow).
 //               : skip N lines before csv header option added for cases where csv header is not on the 1st row of the file
 //               : Added column numbers to X column and Y column listboxes  to make it easier to select columns when names are not very descriptive (or missing).
+// 2v3 3/1/2022  : can optionally use yasort2() for sorting - this has a guarantee on worse case execution time and can use all available processors to speed up sorting
+//               : yamedian() used which can calculate median in place without needing to copy the array of y values (but will for speed if memory is available)
+//               : added linear regression y=m*x*log2(x)+c
 //
 //---------------------------------------------------------------------------
 /*----------------------------------------------------------------------------
- * Copyright (c) 2019,2020,2021 Peter Miller
+ * Copyright (c) 2019,2020,2021,2022 Peter Miller
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
@@ -107,7 +110,7 @@
 #include "multiple-lin-reg-fn.h"
 
 extern TForm1 *Form1;
-const char * Prog_Name="CSVgraph (Github) 2v2g";   // needs to be global as used in about box as well.
+const char * Prog_Name="CSVgraph (Github) 2v3";   // needs to be global as used in about box as well.
 #if 1 /* if 1 then use fast_strtof() rather than atof() for floating point conversion. Note in this application this is only slightly faster (1-5%) */
 extern "C" float fast_strtof(const char *s,char **endptr); // if endptr != NULL returns 1st character thats not in the number
 #define strtod fast_strtof  /* set so we use it in place of strtod() */
@@ -2111,54 +2114,59 @@ void __fastcall TPlotWindow::Button_add_trace1Click(TObject *Sender)
 				pScientificGraph->fnLinreg(SqrtLin,iGraph,filter_callback);
 				break;
 		case 16:
+				// nlog2(n) regression y=m*x*log2(x)+c
+				StatusText->Caption=FString;
+				pScientificGraph->fnLinreg(Nlog2nLin,iGraph,filter_callback);
+				break;
+		case 17:
 				// y=a*x+b*sqrt(x)+c  (least squares fit)
 				StatusText->Caption=FString;
 				pScientificGraph->fnLinreg_3(iGraph,filter_callback);
 				break;
 
-		case 17:
+		case 18:
 				// y=a+b*sqrt(x)+c*x+d*x^1.5
 				StatusText->Caption=FString;
 				gen_lin_reg(reg_sqrt,4,true,iGraph);
 				break;
-		case 18:
+		case 19:
 				// y=(a+bx)/(1+cx)  (least squares fit)
 				StatusText->Caption=FString;
 				pScientificGraph->fnrat_3(iGraph,filter_callback);
 				break;
-		case 19:
+		case 20:
 				// N=5=> y=(a0+a1*x+a2*x^2)/(1+b1*x+b2*x^2)
 				StatusText->Caption=FString;
 				gen_lin_reg(reg_rat,5,true,iGraph);
 				break;
-		case 20: //  general purpose polynomial fit   (least squares using orthogonal polynomials)
+		case 21: //  general purpose polynomial fit   (least squares using orthogonal polynomials)
 				StatusText->Caption=FString;
 				if(!pScientificGraph->fnPolyreg(poly_order,iGraph,filter_callback))
 						{StatusText->Caption="Polynomial fit failed";
 						 ShowMessage("Warning: Polynomial fit failed - adding original trace to graph");
 						}
 				break;
-		case 21:
+		case 22:
 				// general purpose polynomial fit in sqrt(x) with user defined order
 				StatusText->Caption=FString;
 				gen_lin_reg(reg_sqrt,poly_order+1,true,iGraph);
 				break;
-		case 22:
+		case 23:
 				// rational fit (poly1/poly2)  with user defined order
 				StatusText->Caption=FString;
 				gen_lin_reg(reg_rat,poly_order+1,true,iGraph);
 				break;
-		case 23:
+		case 24:
 				// derivative
 				StatusText->Caption=FString;
 				deriv_trace(iGraph);
 				break;
-		case 24:
+		case 25:
 				// integral
 				StatusText->Caption=FString;
 				integral_trace(iGraph);
 				break;
-		case 25: // bool TScientificGraph::fnFFT(bool dBV_result,bool hanning,int iGraphNumberF, void (*callback)(unsigned int cnt,unsigned int maxcnt))
+		case 26: // bool TScientificGraph::fnFFT(bool dBV_result,bool hanning,int iGraphNumberF, void (*callback)(unsigned int cnt,unsigned int maxcnt))
 				// fft return ||
 				StatusText->Caption=FString;
 				if(!pScientificGraph->fnFFT(false,false,iGraph,filter_callback))
@@ -2166,7 +2174,7 @@ void __fastcall TPlotWindow::Button_add_trace1Click(TObject *Sender)
 						 ShowMessage("Warning: FFT failed - adding original trace to graph");
 						}
 				break;
-		case 26: // bool TScientificGraph::fnFFT(bool dBV_result,bool hanning,int iGraphNumberF, void (*callback)(unsigned int cnt,unsigned int maxcnt))
+		case 27: // bool TScientificGraph::fnFFT(bool dBV_result,bool hanning,int iGraphNumberF, void (*callback)(unsigned int cnt,unsigned int maxcnt))
 				// fft return dBV
 				StatusText->Caption=FString;
 				if(!pScientificGraph->fnFFT(true,false,iGraph,filter_callback))
@@ -2174,7 +2182,7 @@ void __fastcall TPlotWindow::Button_add_trace1Click(TObject *Sender)
 						 ShowMessage("Warning: FFT failed - adding original trace to graph");
 						}
 				break;
-		case 27: // bool TScientificGraph::fnFFT(bool dBV_result,bool hanning,int iGraphNumberF, void (*callback)(unsigned int cnt,unsigned int maxcnt))
+		case 28: // bool TScientificGraph::fnFFT(bool dBV_result,bool hanning,int iGraphNumberF, void (*callback)(unsigned int cnt,unsigned int maxcnt))
 				// fft with Hanning window, return ||
 				StatusText->Caption=FString;
 				if(!pScientificGraph->fnFFT(false,true,iGraph,filter_callback))
@@ -2182,7 +2190,7 @@ void __fastcall TPlotWindow::Button_add_trace1Click(TObject *Sender)
 						 ShowMessage("Warning: FFT failed - adding original trace to graph");
 						}
 				break;
-		case 28: // bool TScientificGraph::fnFFT(bool dBV_result,bool hanning,int iGraphNumberF, void (*callback)(unsigned int cnt,unsigned int maxcnt))
+		case 29: // bool TScientificGraph::fnFFT(bool dBV_result,bool hanning,int iGraphNumberF, void (*callback)(unsigned int cnt,unsigned int maxcnt))
 				// fft with Hanning window return dBV
 				StatusText->Caption=FString;
 				if(!pScientificGraph->fnFFT(true,true,iGraph,filter_callback))
@@ -2194,20 +2202,20 @@ void __fastcall TPlotWindow::Button_add_trace1Click(TObject *Sender)
   if(*ys!=',')
 	{// if this is the final trace then rescale & actually plot, otherwise skip this step to save time
 	 if(iGraph==0 || !zoomed)
-        { // if 1st graph or not already zoomed then autoscale, otherwise leave this to the user.
+		{ // if 1st graph or not already zoomed then autoscale, otherwise leave this to the user.
 		 StatusText->Caption="Autoscaling";
-         Application->ProcessMessages(); /* allow windows to update (but not go idle) */
-         pScientificGraph->fnAutoScale();
-        }
-     StatusText->Caption="Drawing graph";
-     Application->ProcessMessages(); /* allow windows to update (but not go idle) */
-     fnReDraw();
-    }
+		 Application->ProcessMessages(); /* allow windows to update (but not go idle) */
+		 pScientificGraph->fnAutoScale();
+		}
+	 StatusText->Caption="Drawing graph";
+	 Application->ProcessMessages(); /* allow windows to update (but not go idle) */
+	 fnReDraw();
+	}
   end_t=clock();
 
   if(*ys==',')
-        {// multiple items are comma seperated
-         ++ys; // skip ,
+		{// multiple items are comma seperated
+		 ++ys; // skip ,
 		 rewind(fin); // back to start of file
 		 char *text_first_line=NULL;
 		 {int skip_initial_lines=_wtoi(Form1->pPlotWindow->Edit_skip_lines->Text.c_str());
@@ -3075,3 +3083,4 @@ void __fastcall TPlotWindow::Action1Execute(TObject *Sender)
 
 
 
+
diff --git a/UDataPlotWindow.dfm b/UDataPlotWindow.dfm
index 894d531..29666b7 100644
--- a/UDataPlotWindow.dfm
+++ b/UDataPlotWindow.dfm
@@ -30,7 +30,7 @@ object PlotWindow: TPlotWindow
   OnResize = FormResize
   DesignSize = (
     1290
-    982)
+    1002)
   PixelsPerInch = 96
   TextHeight = 13
   object Panel1: TPanel
@@ -705,18 +705,19 @@ object PlotWindow: TPlotWindow
         'Median Filter'
         'Median1 Filter'
         'Linear Filter order:'
-        'Lin.regression (y=mx)'
-        'Lin. regression (y=mx+c)'
-        'GMR regression (y=mx+c)'
+        'Lin.regression: y=mx'
+        'Lin. regression: y=mx+c'
+        'GMR regression: y=mx+c'
         'min abs err: y=mx+c'
         'min rel err: y=mx+c'
-        'Log (y=m*log(x)+c)'
-        'Exponential y=c*exp(mx)'
-        'Power y=c*x^m'
-        'Recip (y=m/x+c)'
+        'Log: y=m*log(x)+c'
+        'Exponential: y=c*exp(mx)'
+        'Power: y=c*x^m'
+        'Recip: y=m/x+c'
         'y=1/(mx+c)'
-        'Hyperbolic'
-        'sqrt y=m*sqrt(x)+c'
+        'Hyperbolic: y=x/(m+c*x)'
+        'sqrt: y=m*sqrt(x)+c'
+        'y=m*x*log2(x)+c'
         'y=a*x+b*sqrt(x)+c'
         'y=a+bx^0.5+cx+dx^1.5'
         'y=(a+bx)/(1+cx)'
@@ -934,7 +935,7 @@ object PlotWindow: TPlotWindow
     Left = 304
     Top = 32
     Bitmap = {
-      494C01010A000E00880110001000FFFFFFFFFF10FFFFFFFFFFFFFFFF424D3600
+      494C01010A000E00A40110001000FFFFFFFFFF10FFFFFFFFFFFFFFFF424D3600
       0000000000003600000028000000400000003000000001002000000000000030
       0000000000000000000000000000000000000000000000000000000000000000
       0000000000000000000000000000000000000000000000000000000000000000
diff --git a/UScalesWindow.cpp b/UScalesWindow.cpp
index 4cececd..61b2ffd 100644
--- a/UScalesWindow.cpp
+++ b/UScalesWindow.cpp
@@ -4,7 +4,7 @@
 //  Loosely based on an original public domain version by  Frank heinrich, mail@frank-heinrich.de
 //---------------------------------------------------------------------------
 /*----------------------------------------------------------------------------
- * Copyright (c) 2019 Peter Miller
+ * Copyright (c) 2019,2022 Peter Miller
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
diff --git a/UScientificGraph.cpp b/UScientificGraph.cpp
index c046314..0c62a40 100644
--- a/UScientificGraph.cpp
+++ b/UScientificGraph.cpp
@@ -7,7 +7,7 @@
 // 6/2/2021 for 2v0 major change to use float *x_vals,*y_vals rather than SDataPoints in a TLIST.
 //
 /*----------------------------------------------------------------------------
- * Copyright (c) 2019 Peter Miller
+ * Copyright (c) 2019,2022 Peter Miller
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
@@ -45,6 +45,8 @@
 #include <cmath>
 #include "kiss_fftr.h" // for fft
 #include "_kiss_fft_guts.h"
+#include "yasort2.h" /* needs to be set so we sort float's ie include "#define elem_type_sort2 float" */
+#include "yamedian.h" /* needs to be set to work on floats eg  #define elem_type_median float */
 
 // #define USE_double_to_str_exp /* define to use double_to_str_exp() for y axis with max 12 chars, if not defined use gcvt() */
 
@@ -1227,17 +1229,21 @@ void TScientificGraph::fnMedian_filt(unsigned int median_ahead, int iGraphNumber
 
                         }
                  if(i==0)
-                        {// do exact median by taking a copy of the values, and using quick_select() to find the median
-                         float *arr=(float *)calloc(endi,sizeof(float));
-                         if(arr!=NULL)
-                                {
-                                 for(unsigned int k=0; k<endi;++k)   // take a copy of all the y values that we want median of, also get min,max,average
-                                        {
+						{
+#if 1
+						 f=ya_median(pAGraph->y_vals,endi); // calculate median in place (don't change arr).
+#else                    // do exact median by taking a copy of the values, and using quick_select() to find the median
+						 float *arr=(float *)calloc(endi,sizeof(float));
+						 if(arr!=NULL)
+								{
+								 for(unsigned int k=0; k<endi;++k)   // take a copy of all the y values that we want median of, also get min,max,average
+										{
 										 arr[k]= pAGraph->y_vals[k];
-                                        }
-                                 f=quick_select( arr, endi); // initialise with actual median as its quick to calculate
-                                free(arr);
-                               }
+										}
+								 f=quick_select( arr, endi); // initialise with actual median as its quick to calculate
+								free(arr);
+							   }
+#endif
                         }
                  last_endi=endi; // ensure we don't check values we have already processed
                  if(endi>=maxi)
@@ -1315,7 +1321,10 @@ void TScientificGraph::fnMedian_filt(unsigned int median_ahead, int iGraphNumber
 				 this_endT=pAGraph->x_vals[i]+median_ahead_t; // time for the end of the look ahead
 				 for(endi=last_endi;endi<maxi && pAGraph->x_vals[endi] < this_endT;++endi); // search forward to find i that matches end of look ahead time
 				 if(i==0)
-						{// do exact median by taking a copy of the values, and using quick_select() to find the median
+						{
+#if 1
+						 m=ya_median(pAGraph->y_vals,endi); // calculate median in place (don't change arr).
+#else                    // do exact median by taking a copy of the values, and using quick_select() to find the median
 						 float *arr=(float *)calloc(endi,sizeof(float));
 						 if(arr!=NULL)
 								{
@@ -1326,6 +1335,7 @@ void TScientificGraph::fnMedian_filt(unsigned int median_ahead, int iGraphNumber
 								 m=quick_select( arr, endi); // initialise with actual median as its quick to calculate
 								free(arr);
 							   }
+#endif
 						}
 				 last_endi=endi; // ensure we don't check values we have already processed
 				 if(endi>=maxi)
@@ -1559,7 +1569,7 @@ void TScientificGraph::fnrat_3(int iGraphNumberF, void (*callback)(unsigned int
 void TScientificGraph::fnLinreg(enum LinregType type, int iGraphNumberF, void (*callback)(unsigned int cnt,unsigned int maxcnt))
  // apply 1st order least squares linear regression (y=mx+c) to graph in place
  // can also do GMR for a straight line when type = LinLin_GMR
- // enum LinregType  {LinLin,LogLin,LinLog,LogLog,RecipLin,LinRecip,RecipRecip,SqrtLin}; defines preprocessing of varibales before linear regression 1st is X 2nd is Y
+ // enum LinregType  {LinLin,LogLin,LinLog,LogLog,RecipLin,LinRecip,RecipRecip,SqrtLin}; defines preprocessing of variables before linear regression 1st is X 2nd is Y
  // results checked using csvfun3.csv. R^2 values (and coefficients) also checked against Excel for the fits excel can do.
 {// to save copying data this is done inline
  SGraph *pAGraph = ((SGraph*) pHistory->Items[iGraphNumberF]);
@@ -1597,6 +1607,11 @@ void TScientificGraph::fnLinreg(enum LinregType type, int iGraphNumberF, void (*
 			{if(xi<0 ) continue; // avoid sqrt of a negative number
 			 xi=sqrt(xi);
 			}
+		 if(type== Nlog2nLin)
+			{if(xi<=0 ) continue; // avoid ln of a negative number
+			 // M_LOG2E 1.44269504088896340736 is defined in math.h // 1/ln(2) as multiplication below is probably faster than division
+			 xi=xi*log(xi)*M_LOG2E;    // xi*log2(xi) = xi*ln(xi)/ln(2)
+			}
 		 ++N;
          meanx+= (xi-meanx)/(double) N; /* calculate means as mi+1=mi+(xi+1 - mi)/i+1 , this should give accurate results and avoids the possibility of the "sum" overflowing*/
          meany+= (yi-meany)/(double) N;
@@ -1676,6 +1691,11 @@ void TScientificGraph::fnLinreg(enum LinregType type, int iGraphNumberF, void (*
 		// sqrt(x): y=m*sqrt(x)+c
 		rprintf("Best Least squares curve is Y=%g*sqrt(X)%+g which has an R^2 of %g\n",m,c,r2);
 		break;
+	 case Nlog2nLin:
+		// n*log2(n): y=m*x*log2(x)+c
+		rprintf("Best Least squares curve is Y=%g*X*log2(X)%+g which has an R^2 of %g\n",m,c,r2); // log base 2
+		rprintf("Best Least squares curve is Y=%g*X*log(X)%+g\n",m*M_LOG2E,c); // log base e
+		break;
 	}
  // now put new y values back, calculated as y=m*x+c
  for(i=0;i<iCount;++i)
@@ -1719,6 +1739,10 @@ void TScientificGraph::fnLinreg(enum LinregType type, int iGraphNumberF, void (*
 			// sqrt(x): y=m*sqrt(x)+c
 			pAGraph->y_vals[i]=m*(xi>=0?sqrt(xi):0.0f)+c;
 			break;
+		 case Nlog2nLin:
+			// n*log2(n): y=m*x*log2(x)+c
+			pAGraph->y_vals[i]=m*(xi>0?xi*log(xi)*M_LOG2E:0.0f)+c;
+			break;
 		}
 	   e=fabs(yi-pAGraph->y_vals[i]);
 	   if(e>maxe) maxe=e;
@@ -2741,12 +2765,19 @@ void TScientificGraph::myqsort(int iGraphNumberF, int p, int r)
 #undef my_swapc
 #undef Z
 
-
 void TScientificGraph::sortx( int iGraphNumberF) // sort ordered on x values  (makes x values increasing)
 {
  SGraph *pAGraph = ((SGraph*) pHistory->Items[iGraphNumberF]);
  unsigned int iCount=pAGraph->nos_vals ;
  time_t start_t=clock();
+#if 1
+  /* sort using yasort2() */
+
+ float *xa=pAGraph->x_vals;
+ float *ya=pAGraph->y_vals;
+ yasort2(xa,ya,iCount);
+ rprintf(" sort (yasort2()) completed in %g secs\n",(clock()-start_t)/(double)CLOCKS_PER_SEC);
+#else /* original sorting code */
 #ifdef CHECK_DEPTH
  maxdepth=0;
  rn=3103515245u; // initialise random number generator to the same value every time to get consistant sorts for the same data
@@ -2763,6 +2794,7 @@ void TScientificGraph::sortx( int iGraphNumberF) // sort ordered on x values  (m
    pAGraph->x_vals[0],pAGraph->x_vals[1],pAGraph->x_vals[2],pAGraph->x_vals[3],
    pAGraph->x_vals[4],pAGraph->x_vals[5]);
 #endif
+#endif
 #if 1  /* check array actually is sorted correctly */
  int errs=0;
 
diff --git a/UScientificGraph.h b/UScientificGraph.h
index 40ffc5d..85aaed9 100644
--- a/UScientificGraph.h
+++ b/UScientificGraph.h
@@ -9,7 +9,7 @@
 #include <Graphics.hpp>
 // #define CHECK_DEPTH /* if defined check depth of recursion in myqsort() */
 
-enum LinregType  {LinLin,LinLin_GMR,LogLin,LinLog,LogLog,RecipLin,LinRecip,RecipRecip,SqrtLin};
+enum LinregType  {LinLin,LinLin_GMR,LogLin,LinLog,LogLog,RecipLin,LinRecip,RecipRecip,SqrtLin,Nlog2nLin};
 
 // class for scientific plots
 
@@ -149,6 +149,7 @@ class TScientificGraph
   bool fnAddDataPoint(float dXValueF, float dYValueF,
 	   int iGraphNumberF = 0); // returns true if added OK else false
   float fnAddDataPoint_nextx(int iGraphNumberF);    // returns next x value for this graph assuming its the same as the previous graph
+  float fnAddDataPoint_thisy(int iGraphNumber);    // returns next y value of iGraphNumber (locn from current graph number)  used to do $T1
   bool fnChangeXoffset(double dX); // change all X values by adding dX to the most recently added graph if at least 2 graphs defined
   void fnMedian_filt(unsigned int median_ahead, int iGraphNumberF = 0); // apply median filter to graph in place , lookahead defined in samples
   void fnMedian_filt_time1(double median_ahead_t, int iGraphNumberF, void (*callback)(unsigned int cnt,unsigned int maxcnt)); // new algorithm apply median filter to graph in place  , lookahead defined in time
diff --git a/Unit1.cpp b/Unit1.cpp
index 50edb09..86519de 100644
--- a/Unit1.cpp
+++ b/Unit1.cpp
@@ -4,7 +4,7 @@
 //  Loosely based on an original public domain version by  Frank heinrich, mail@frank-heinrich.de
 //---------------------------------------------------------------------------
 /*----------------------------------------------------------------------------
- * Copyright (c) 2019 Peter Miller
+ * Copyright (c) 2019.2022 Peter Miller
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
diff --git a/atof.c b/atof.c
index d5cb857..ba1ebc8 100644
--- a/atof.c
+++ b/atof.c
@@ -15,7 +15,7 @@
 /*----------------------------------------------------------------------------
  * MIT License:
  *
- * Copyright (c) 2020 Peter Miller
+ * Copyright (c) 2020,2022 Peter Miller
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
diff --git a/csvgraph.cbproj b/csvgraph.cbproj
index 7033d45..2774243 100644
--- a/csvgraph.cbproj
+++ b/csvgraph.cbproj
@@ -160,9 +160,18 @@
             <DependentOn>UScientificGraph.h</DependentOn>
             <BuildOrder>10</BuildOrder>
         </CppCompile>
-        <FormResources Include="UDataPlotWindow.dfm"/>
+        <CppCompile Include="ya-median.c">
+            <BuildOrder>15</BuildOrder>
+        </CppCompile>
+        <CppCompile Include="ya-select.c">
+            <BuildOrder>14</BuildOrder>
+        </CppCompile>
+        <CppCompile Include="ya-sort2.c">
+            <BuildOrder>13</BuildOrder>
+        </CppCompile>
         <FormResources Include="Unit1.dfm"/>
         <FormResources Include="UScalesWindow.dfm"/>
+        <FormResources Include="UDataPlotWindow.dfm"/>
         <FormResources Include="About.dfm"/>
         <BuildConfiguration Include="Release">
             <Key>Cfg_2</Key>
diff --git a/csvgraph.cbproj.local b/csvgraph.cbproj.local
index 461d4f3..c82976b 100644
--- a/csvgraph.cbproj.local
+++ b/csvgraph.cbproj.local
@@ -7,7 +7,10 @@
     <Transaction>2020/12/31 20:56:19.000.021,C:\PMi\builder-10v1-projects\csv-graph-github\csvgraphPCH1.h=</Transaction>
     <Transaction>2021/02/10 20:39:27.000.186,=C:\PMi\builder-10v1-projects\csv-graph-github\lc.c</Transaction>
     <Transaction>2021/02/14 14:59:05.000.076,C:\PMi\builder-10v1-projects\csv-graph-github\lc.c=</Transaction>
-    <Transaction>2021/03/14 17:14:03.000.255,=C:\PMi\builder-10v1-projects\csv-graph-github\matrix.cpp</Transaction>
     <Transaction>2021/03/14 17:14:03.000.241,=C:\PMi\builder-10v1-projects\csv-graph-github\multiple-lin-reg-fn.cpp</Transaction>
+    <Transaction>2021/03/14 17:14:03.000.255,=C:\PMi\builder-10v1-projects\csv-graph-github\matrix.cpp</Transaction>
+    <Transaction>2022/01/03 17:51:44.000.923,=C:\PMi\builder-10v1-projects\csv-graph-github\ya-sort2.c</Transaction>
+    <Transaction>2022/01/16 20:54:35.000.158,=C:\PMi\builder-10v1-projects\csv-graph-github\ya-select.c</Transaction>
+    <Transaction>2022/01/16 20:54:35.000.169,=C:\PMi\builder-10v1-projects\csv-graph-github\ya-median.c</Transaction>
   </Transactions>
 </BorlandProject>
diff --git a/csvgraph.cpp b/csvgraph.cpp
index 7847779..9581eb7 100644
--- a/csvgraph.cpp
+++ b/csvgraph.cpp
@@ -1,4 +1,28 @@
 //---------------------------------------------------------------------------
+/*----------------------------------------------------------------------------
+ * MIT License:
+ *
+ * Copyright (c) 2020,2022 Peter Miller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHOR OR COPYRIGHT HOLDER BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *--------------------------------------------------------------------------*/
 
 #include <vcl.h>
 #pragma hdrstop
diff --git a/csvgraph.docx b/csvgraph.docx
index 0198eba..ce4628f 100644
Binary files a/csvgraph.docx and b/csvgraph.docx differ
diff --git a/csvgraph.pdf b/csvgraph.pdf
index 1ca9a29..120f934 100644
Binary files a/csvgraph.pdf and b/csvgraph.pdf differ
diff --git a/csvgraph.stat b/csvgraph.stat
index 272ad00..9c090bb 100644
--- a/csvgraph.stat
+++ b/csvgraph.stat
@@ -1,10 +1,10 @@
 [Stats]
-EditorSecs=177299
-DesignerSecs=7005
-InspectorSecs=5813
-CompileSecs=9552114
-OtherSecs=22336
+EditorSecs=182146
+DesignerSecs=7121
+InspectorSecs=5862
+CompileSecs=10350932
+OtherSecs=23819
 StartTime=24/11/2020 19:51:59
 RealKeys=0
 EffectiveKeys=0
-DebugSecs=5524
+DebugSecs=5674
diff --git a/expr-code.cpp b/expr-code.cpp
index 7ff2b47..e90de28 100644
--- a/expr-code.cpp
+++ b/expr-code.cpp
@@ -3,7 +3,7 @@
 #pragma hdrstop
 /* expression handling code and other generally useful code for Builder C++ */
 /*----------------------------------------------------------------------------
- * Copyright (c) 2012, 2013 Peter Miller
+ * Copyright (c) 2012, 2013,2022 Peter Miller
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
diff --git a/interpolate.cpp b/interpolate.cpp
index 619784f..cf8b9e2 100644
--- a/interpolate.cpp
+++ b/interpolate.cpp
@@ -9,7 +9,7 @@
 
  */
 /*----------------------------------------------------------------------------
- * Copyright (c) 2012, 2013 Peter Miller
+ * Copyright (c) 2012, 2013,2022 Peter Miller
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
diff --git a/matrix.cpp b/matrix.cpp
index 102261c..f117bd0 100644
--- a/matrix.cpp
+++ b/matrix.cpp
@@ -8,7 +8,7 @@
 */
 
 /*----------------------------------------------------------------------------
- * Copyright (c) 2012, 2013 Peter Miller
+ * Copyright (c) 2012, 2013,2022 Peter Miller
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
diff --git a/multiple-lin-reg-fn.cpp b/multiple-lin-reg-fn.cpp
index f460ba6..e574f3a 100644
--- a/multiple-lin-reg-fn.cpp
+++ b/multiple-lin-reg-fn.cpp
@@ -14,7 +14,7 @@
 
 */
 /*----------------------------------------------------------------------------
- * Copyright (c) 2014 Peter Miller
+ * Copyright (c) 2014,2022 Peter Miller
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
diff --git a/ya-median.c b/ya-median.c
new file mode 100644
index 0000000..714d7ad
--- /dev/null
+++ b/ya-median.c
@@ -0,0 +1,417 @@
+/* ya-median.c
+   ===============
+   This function returns the median of an array of numbers without changing the array.
+   If the order of items in the input array can be changed then yaMedian() (in ya-select.c) is approx 35% faster overall with my test program.
+   Type type of data used is defined in yamedian.h (elem_type_median).
+   
+   Where useful and possible it uses malloc to allocate an array with size up to the size of the input array.
+   When this fails it uses a slower approach, but it keeps trying to malloc smaller amounts of space.
+   No free space is required to calculate the median (apart from some stack space for local variables), but the more space that can be obtained from malloc the faster the result is obtained.
+   
+   The algorithm first does a pass of the data to find the min, max and to test for randomness
+   1 If the data is not random then a pass of the basic Torben algorithm is done as this can frequently find the median in 1 pass with no extra memory.
+   2 If the data is random the if malloc suceeds its used to take a copy of the input array and a fast select algorithm is used.
+   3 If the above fails then 2 iterations of the Torben algorithm are executed in an "unrolled" form in one pass of the input data. This may find the median with no extra memory.
+   	   This code uses a quadratic fit of the previous results to try and better estimate the median - this results in faster convergence.
+   4 If malloc now suceeds a copy of a subset of the array is taken and select used
+   5 loop back to step 3
+    
+ This software (and the underlying algorithm) was created by Peter Miller 2021, but it heavily leaverages work by others:
+    Torben Algorithm by Torben Mogensen, the original (public doamin) implementation was by N. Devillard.
+	yaselect() based on the paper "Fast Deterministic Selection" by Andrei Alexandrescu, 16th International Symposium on Experimental Algorithms (SEA 2017) 
+	Check for randomness is test 66 in "Statistical Tests" by Gpoal Kanji. 
+ 
+ The Torben algorithm uses the mid-range ( (max-min)/2 ) as the "pivot" value.
+https://en.wikipedia.org/wiki/Mid-range shows that the mid-range is a reasonable estimate of the mean, for example for a continuous uniform distribution the mid-range is the best estimator 
+of the mean, and for normal and laplace distributions it gives an unbiased estimate of the mean with a reasonably low variance.
+Calculating the actual mean, in a way that cannot overflow, is too slow to be useful here. 
+The mean is known to be within 1 standard deviation of the median - see Lemma 1 of "Fast Computation of the Median by Successive Binning", Ryan J. Tibshirani, October 2008.
+The range rule - https://www.thoughtco.com/range-rule-for-standard-deviation-3126231 states that the range=max-min divided by 4 is approximately the standard deviation.
+Thus using the mid-range as the pivot will "typically" place the pivot within range/4 of the median which would result in a typical o(n) run-time. 
+There are known worse case patterns for this eg:
+0,1,2,4,8,16,32,...
+ie 0,k^0, k^1,k^2,k^3 where k>=2 
+Here pivoting on the mid-range gives a new partition thats only 2 elements smaller, eg (32+0)/2=16 then (8+0)/2=4 then (2+0)/2=1
+However, in practice this sequence is limited by the type elem_type_median , eg if this is 32 bit unsigned integers then 2^32 is the largest value and this is only 32 values.
+Even using floating point values the exponent range is still very limited (a double only has 11 bits in its exponent so its limited to 2047).
+If the sequence is repeated then the reduction every time becomes geometric and the Torben algorithm becomes O(n) run-time.
+It is therefore postulated (but not proven) that with the Torben algorithm the worse case run-time is O(n).
+When there is enough free memory for a complete copy of the input array then the execution time is guaranteed to be O(n).
+ 
+ */
+ /* 
+Copyright (c) 2021,2022 Peter Miller
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+// #define DEBUG /* uncomment to get debug printf's 
+// you should not need to edit anything below here for normal use of this software (you may need to edit yamedian.h) 
+
+
+#ifdef DEBUG
+ #include <stdio.h>
+#endif
+#include <stddef.h> /* for size_t */
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "yamedian.h" /* defines elem_type_median etc */
+
+#define USE_MALLOC   /* if defined (as it normally should be) use malloc whenever possible. It its not defined the correct answer will still be found, but it may take longer to execute */
+
+elem_type_median ya_median(elem_type_median *m, const size_t n)
+{
+    elem_type_median  min, max;
+    min = max = m[0] ; 
+	 /* check for randomness is test 66 in "Statistical Tests" by Gpoal Kanji. */
+	 elem_type_median lastt=m[0];
+	 size_t nos_plus_signs=0;
+     for (elem_type_median *p=m+1; p<m+n; p++) 
+		{// first loop, just done once, to find min & max
+    	 elem_type_median t=*p;
+         if (t<min) min=t;
+         if (t>max) max=t; // else is OK as if we have a new min it cannot also be a new max - but I seem to get faster execution speed without the else !
+         if(t>lastt) nos_plus_signs++; // count of times sequence increases 
+         lastt=t;
+    	}
+	 /* 1.64*sd is 90% confidence limit */
+	 bool is_random=nos_plus_signs>=(n-1)/2-1.64*(n+1)/12 && nos_plus_signs<=(n-1)/2+1.64*(n+1)/12;
+#ifdef DEBUG	 
+     printf(" check randomness: nos_plus_signs=%llu for random expected %.0f to %.0f so is %s\n",nos_plus_signs,(n-1)/2-1.64*(n+1)/12, (n-1)/2+1.64*(n+1)/12, is_random?"random":"not random"); 
+#endif    
+
+     if(is_random)
+     	{ /* try to use select as thats faster when the input is random - but we must take a copy of the input array as we must not change that here */
+#ifdef USE_MALLOC     	
+ 		 elem_type_median *c,med; // c=copy of input array 
+ 		 c=malloc(sizeof(elem_type_median)*n);
+ 		 if(c!=NULL) // if space for a copy 
+ 		 	{
+#ifdef DEBUG 		 		
+			 printf("  Is random so using yaselect()\n");
+#endif			 
+			 memcpy(c,m,sizeof(elem_type_median)*n);
+ 			 yaselect(c,(n-1)/2,n);
+			 med=c[(n-1)/2];// median
+ 			 free(c); 
+ 			 return med;
+ 			}
+#endif			 	
+		}  
+	 else
+	 	{/* else do initial optimised Torben pass - in many cases this will quickly find the median especially as "is_random" is false here */
+	 	 size_t     less1, greater1, equal1;
+    	 elem_type_median  guess1, maxltguess1, mingtguess1;
+	 	 less1 = 0; greater1 = 0; equal1 = 0;		 	 
+	 	 if(max==min) 
+		 	return max; // only 1 value so no need to search any more
+     	 // guess = (min+max)/2;// guess does not have to be a value that actually exists, min and max are always actual values
+     	 guess1 = min/2+max/2; // (min+max)/2 could overflow, especially if elem_type_median was an integer type
+#ifdef DEBUG         
+	 	 printf("  Is not random, so using initial Torben pass: min=%.0f max=%.0f guess1=%.1f\n",min,max,guess1);
+#endif		 
+     	 maxltguess1 = min ;
+     	 mingtguess1 = max ;
+     	 /* don't count # equals, calculate it at the end as that might be a little faster */
+     	 for (elem_type_median *p=m; p<m+n; p++) 
+		 	{elem_type_median t=*p;
+    	  	 if (t<guess1) 
+		 		{
+    	     	 less1++;
+    	     	 if (t>maxltguess1) maxltguess1 = t;
+    	   		} 
+		  	 else if (t!=guess1) // ! < and != means > but != might be faster to calculate than > 
+				{
+             	 greater1++;
+             	 if (t<mingtguess1) mingtguess1 = t;
+           		}	 
+         	}
+     	 equal1=n-less1-greater1;            	
+     	 if (less1 <= (n+1)/2 && greater1 <= (n+1)/2) 
+	 		{// done just need to work out what to return
+    	 	 if (less1 >= (n+1)/2) return maxltguess1;
+    	 	 else if (less1+equal1 >= (n+1)/2) return guess1;
+    	 	 else return mingtguess1; 
+    		}
+     	 else if (less1>greater1) max = maxltguess1 ;
+     	 else min = mingtguess1;		
+		}
+  
+     int itns=0;
+	 elem_type_median  guess1=0,guess2=0,guess3=0;	 
+	 size_t     less=1,greater=1; // 1 as we initialise min=max=m[0] and then iterate from m[1]
+     size_t     less1=0, greater1, equal1;
+     elem_type_median  maxltguess1, mingtguess1;		 
+     size_t     less2=0, greater2, equal2;
+     elem_type_median  maxltguess2, mingtguess2;    
+     size_t     less3=0, greater3, equal3;
+     elem_type_median  maxltguess3, mingtguess3; 
+     elem_type_median  last_guess1,last_guess2,last_guess3;	 	 
+     while (1) 
+		{/* do a "binary search" to find median - we actually "unroll" the main loop so two iterations are done in parallel.
+		    This could take  0.5*log2(n) iterations of this while loop 
+		    Uses the results from the previous iteration fitted to a quadratic to better estimate the median - this speeds up its convergence
+		 */	 
+		 // greater values are calculated from less, equal outside of loop	 
+		 ++itns;// count loops done		 
+		 if(max==min) return max; // only 1 value so no need to search any more
+         // guess = (min+max)/2;// guess does not have to be a value that actually exists, min and max are always actual values
+		 last_guess1=guess1; last_guess2=guess2; last_guess3=guess3;
+         guess1 = (min/4)*3+(max/4); 
+         guess2 = min/2+max/2; // (min+max)/2 could overflow as could min+(max-min)/2 especially if elem_type_median was a signed integer type
+         guess3 = (min/4)+(max/4)*3; 
+		// if using integer maths then make estimates more accurate
+		if( ((elem_type_median)1)/2 == 0)
+			{// if we are using integer maths ; assume optimiser will optimise this out conmpletely if using floating point maths
+			 // printf("Int maths ");
+			 if( ((uint32_t)min&1) & ((uint32_t)max&1) ) // (uint32_t) is OK here as we are only looking at lsb so size of type does not really matter [ but without cast will not compile if elem_type_median is double]
+			 	{guess2++;// eg min=1 max=5 1/2+5/2 =0+2 = 2 but both min & max are odd so final value is 3 
+			 	}
+			 if(guess1<=min) guess1=min+1;// adjust guess1 & guess 3 as well - we know we have integer maths so +1 is OK
+			 if(guess3<=guess2) guess3=guess2+1;			 	
+			}		          
+
+	     if(itns>1)
+	     	{
+	     	 /* estimate median from results of previous pass - quadratic estimation */
+	  	 	 /* quadratic is y=a*x^2+b*x+c where y is the guessed median and x is the matching count */
+	  	 	 /* this is about 10% faster than linear estimation with the test program */
+			 long double a,b,c; // straight line fit - need good precision here as potentially subtracting big numbers that are very close
+			 elem_type_median best_guess;
+			 less1+=equal1; // count <=last_guess1
+			 less2+=equal2;
+			 less3+=equal3; // count <=last_guess3
+			 if(less1<less2 && less2<less3 && last_guess1<last_guess2 && last_guess2<last_guess3) // if there were results from a previous pass that allow a,b,c to be calculated
+			 	{a=((less3-less1)*(last_guess3-last_guess2)-(less3-less2)*(last_guess3-last_guess1)) /
+				   ( ((long double)less3*(long double)less3-(long double)less2*(long double)less2)*(less3-less1) - ((long double)less3*(long double)less3-(long double)less1*(long double)less1)*(less3-less2)  );
+				 b=(last_guess3-last_guess1-a*((long double)less3*(long double)less3-(long double)less1*(long double)less1))/(less3-less1) ;
+				 c=last_guess3-a*(long double)less3*(long double)less3-b*less3;
+	
+				 less2=(n+1)/2;// make sure calculated as an unsigned integer
+				 best_guess=a*(long double)less2*(long double)less2+b*(long double)less2+c; // want guess for (n+1)/2 = median location
+	#ifdef DEBUG 			 
+				 printf("  less1=%1.f last_guess1=%.1f less3=%.1f last_guess3=%.1f a=%g b=%g c=%g best_guess=%.1f min=%.1f max=%.1f\n",(double)less1,(double)last_guess1,(double)less3,(double)last_guess3,(double)a,(double)b,(double)c,(double)best_guess,(double)min,(double)max); 
+	#endif			 
+				 if(best_guess>min && best_guess<max)
+				 	{if(best_guess<guess2) 
+					 	{
+	#ifdef DEBUG 				 		
+					 	 printf("   guess1(was %.1f) changed to best_guess (%.1f) min=%.1f guess1 (best)=%.1f guess2=%.1f guess3=%.1f max=%.1f\n",(double)guess1,(double)best_guess,(double)min,(double)best_guess,(double)guess2,(double)guess3,(double)max);
+	#endif				 	 
+					 	 guess1=best_guess;
+					 	}
+				 	 else if(best_guess>guess2) 
+					  	{
+	#ifdef DEBUG 				  		
+						 printf("  guess3(was %.1f) changed to best_guess (%.1f) min=%.1f guess1=%.1f guess2=%.1f guess3(best)=%.1f max=%.1f\n",(double)guess3,(double)best_guess,(double)min,(double)guess1,(double)guess2,(double)best_guess,(double)max);				  		
+	#endif					 
+						 guess3=best_guess;
+					  	}
+				 	}
+			 	}   
+			}	
+ 		 less=0; // <=min
+		 less1 = 0 ; equal1 = 0;	
+		 less2 = 0;  equal2 = 0;	
+		 less3 = 0; equal3 = 0;	
+         if(guess1<min) guess1=min; // if we are using integer maths the above may not generate the values we expect, so fix that here (this should never be needed for float or double)
+         if(guess1>max) guess1=max;        
+         if(guess2<guess1) guess2=guess1;// it is OK for guess1==guess2 or gues2==guess3 (or them all to be the same) 
+         if(guess2>max) guess2=max;
+         if(guess3<guess2) guess3=guess2;
+         if(guess3>max) guess3=max;   
+		 // here we have min<=guess1<=guess2<=guess3<=max [ we know min!=max as thats trapped above ]      
+#ifdef DEBUG         
+		 printf("  min=%.0f max=%.0f guess1=%.1f guess2=%.1f guess3=%.1f\n",(double)min,(double)max,(double)guess1,(double)guess2,(double)guess3);
+#endif		 
+         maxltguess1=maxltguess2=maxltguess3 = min ;
+         mingtguess1=mingtguess2=mingtguess3 = max ;
+         for (elem_type_median *p=m; p<m+n; p++) 
+		 	{elem_type_median t=*p; // *** code below optimised based on knowledge that min<guess1<guess2<guess3<max . We also calculate greater1,2,3 but this is also done outside this loop
+		 	 if(t<guess2)
+		 	 	{ // guess 2 is the middle on so this approximately splits the array m in 1/2
+		 	 	 if(t<=min)
+		 	 	 	{less++; // just count values <=min - maxltguess & mingtguess are initialised to min/max so don't need to worry about these here
+		 	 	 	 continue;
+		 	 	 	}
+                 less2++;
+                 if (t>maxltguess2) maxltguess2 = t;		 	 	 	
+                 less3++;
+                 if (t>maxltguess3) maxltguess3 = t;
+                 
+				 if (t<guess1) 
+			 		{
+                 	 less1++;
+                 	 if (t>maxltguess1) maxltguess1 = t;
+                 	 continue;
+            		} 
+			 	 else if (t!=guess1) // here != means > 
+					{
+                 	 if (t<mingtguess1) mingtguess1 = t;
+                 	 continue;
+            		}	 
+			 	 else equal1++;
+			 	 continue;
+			 	}
+			 else if (t!=guess2) // t>guess2
+				{
+				 if(t>=max)
+				 	{// no need to count >=max as these are calculated from < and =. maxltguess & mingtguess are initialised to min/max so don't need to worry about these here
+				 	 continue;
+				 	}
+                 if (t<mingtguess2) mingtguess2 = t;
+                 if (t<mingtguess1) mingtguess1 = t;   
+             	 if (t<guess3) 
+			 		{
+                 	 less3++;
+                 	 if (t>maxltguess3) maxltguess3 = t;
+            		} 
+			 	 else if (t!=guess3) // here != means > 
+					{
+                 	 if (t<mingtguess3) mingtguess3 = t;
+            		} 
+			 	 else equal3++;	
+				 continue; // not realy needed but makes it clear					               
+            	} 
+			 else // t==guess2
+			 	{equal2++;
+			 	 if(t==guess3) equal3++; // for integers its possible guess2=guess3
+				 else 
+				 	{less3++;
+			 	 	 if (t>maxltguess3) maxltguess3 = t;
+			 		}
+			 	 if(t==guess1) equal1++; // for integers its possible guess2=guess3	
+			 	 else if (t<mingtguess1) mingtguess1 = t;
+			 	 continue; // not realy needed but makes it clear
+	 			}
+        	}
+    	 // now adjust values to get required numbers
+    	 less1+=less; less2+=less; less3+=less;
+    	 greater1=n-less1-equal1; // this should be more efficient than counting greater1,2,3
+    	 greater2=n-less2-equal2;
+    	 greater3=n-less3-equal3;
+      	 
+         if (less1 <= (n+1)/2 && greater1 <= (n+1)/2)
+		 	{// done 1
+#ifdef DEBUG
+    		 printf(" Torben(%.0f) [1] took %d iterations\n",(double)n,itns);	
+#endif
+    		 if(max==min) return max;
+    		 if (less1 >= (n+1)/2) return maxltguess1;
+    		 else if (less1+equal1 >= (n+1)/2) return guess1;
+    		 else return mingtguess1;		 	
+			}
+         else if (less1>greater1) 
+		 	{if(max > maxltguess1) 
+			 	{max = maxltguess1; // only narrow search range
+			 	 greater=greater1;
+			 	}
+		 	}
+         else 
+		 	{if(min < mingtguess1) 
+			 	{min = mingtguess1; // only narrow search range
+			 	 less=less1;
+			 	}
+		 	}
+         
+         if (less2 <= (n+1)/2 && greater2 <= (n+1)/2)
+		 	{// done 2
+#ifdef DEBUG
+    		 printf(" Torben(%.0f) [2] took %d iterations\n",(double)n,itns);	
+#endif
+    		 if(max==min) return max;
+    		 if (less2 >= (n+1)/2) return maxltguess2;
+    		 else if (less2+equal2 >= (n+1)/2) return guess2;
+    		 else return mingtguess2;		 	
+			}
+         else if (less2>greater2) 
+		 	{if(max > maxltguess2) 
+			 	{max = maxltguess2; // only narrow search range
+			 	 greater=greater2;
+			 	}
+		 	}
+         else 
+		 	{if(min < mingtguess2)
+			 	{ min = mingtguess2; // only narrow search range
+			 	 less=less2;
+			 	}
+		 	}
+		 
+         if (less3 <= (n+1)/2 && greater3 <= (n+1)/2)
+		 	{// done 3
+#ifdef DEBUG
+    		 printf(" Torben(%.0f) [3] took %d iterations\n",(double)n,itns);	
+#endif
+    		 if(max==min) return max;
+    		 if (less3 >= (n+1)/2) return maxltguess3;
+    		 else if (less3+equal3 >= (n+1)/2) return guess3;
+    		 else return mingtguess3;		 	
+			}
+         else if (less3>greater3) 
+		 	{if(max > maxltguess3) 
+			 	{max = maxltguess3; // only narrow search range
+			 	 greater=greater3;
+			 	}
+		 	}
+         else 
+		 	{if(min < mingtguess3) 
+			 	{min = mingtguess3; // only narrow search range
+			 	 less=less3;
+			 	}
+		 	}	
+#ifdef DEBUG		 	
+		 printf("  at end of iteration %u: less=%u greater=%u so number inbetween=%u\n",itns,(unsigned)less,(unsigned)greater,(unsigned)(n-less-greater));	
+#endif	
+
+#ifdef USE_MALLOC  	  
+		 if((n-less-greater)>1 )
+		 	{ // (n-less-greater) elements left. just collect these elements into an array and use select() on them rather than doing further iterations.	
+		 	  // each iteration reduces the number of elements left as min and max get closer to the actual median, so we expect malloc to eventually suceed.
+		 	 size_t buf_cnt=0,cnt_lt_min=0;
+			 elem_type_median *buf,med; // will allocate via malloc()
+			 size_t buf_size=(n-less-greater)+10; // allow a bit more space, just in case
+			 buf=malloc(sizeof(elem_type_median)*buf_size); // allocate space needed
+			 if(buf==NULL) continue; // not enought free ram - keep using Torben algorithm, on the next iteration the space needed will be less so can try again then.		 
+
+			 for (elem_type_median *p=m; p<m+n; p++) 
+			 	{elem_type_median t=*p;
+			 	 if(t<min) cnt_lt_min++;
+			 	 else if(t<=max && buf_cnt<buf_size)
+			 	 	{buf[buf_cnt++]=t;
+			 	 	}
+			 	}
+			 if(buf_cnt>=buf_size)
+			 	{
+#ifdef DEBUG			 		
+				 printf("  *** OOPS buf too small! \n");
+#endif				 
+				 free(buf);// need to free up space for buffer
+			 	 continue; // continue doing Torben, this should be very infrequent (never?) so don't need to worry about the extra time the wasted loop above took
+				}
+#ifdef DEBUG				
+			 printf("  Using yaselect(%u)\n",(unsigned)buf_cnt);	
+#endif			 
+			 yaselect(buf,(n+1)/2-cnt_lt_min -1,buf_cnt);
+			 med=buf[(n+1)/2-cnt_lt_min -1];	
+			 free(buf);// need to free up space for buffer
+			 return med;		 
+			}
+#endif			
+    	}
+ // not reached	 	
+}
diff --git a/ya-select.c b/ya-select.c
new file mode 100644
index 0000000..b1f92ca
--- /dev/null
+++ b/ya-select.c
@@ -0,0 +1,619 @@
+/*   ya-select.c
+	 ===========
+	 Fast selection & median functions.
+	 
+These functions all work on arrays of numbers.
+
+yaMedian() - returns the median of an array of numbers (the array is reordered)
+yaselect() - returns the n'th highest number from an array of numbers (the array is reordered)
+ya_msort() - sorts an array of numbers	 
+
+The algorith used is based on the paper "Fast Deterministic Selection" by Andrei Alexandrescu, 16th International Symposium on Experimental Algorithms (SEA 2017) 
+yaMedian() & yaselect() have guaranteed o(n) execution time by using the principles of introselect - see "Introspective sorting and selection algorithms" by D.R.Musser,Software practice and experience, 8:983-993, 1997.
+ya_msort() has a guaranteed O(n*log2(n)) execution time by using a quicksort with the pivot selected as the median using yaMedian().
+
+You select the type of the arrays used by defining elem_type_median in yamedian.h
+
+This implementation by Peter Miller 18/12/2021.
+
+*/
+/* 
+Copyright (c) 2021,2022 Peter Miller
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#ifndef __BORLANDC__
+ #define NDEBUG /* if defined then asserts become "nothing" */
+#endif
+// you should not need to edit anything below here for normal use of this software (you may need to edit yamedian.h)
+#ifdef DEBUG
+ #include <stdio.h>
+#endif 
+#include <stdint.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <string.h> /* includes memmove() */
+#include <assert.h>
+
+#include "yamedian.h"  /* defines elem_type_median etc */
+
+#define INTROMEDIAN_DIV 3 /* divisor  helps define when not enough progress is being made and we should swap to a more robust (but slower) algorithm */
+						  /* INTROMEDIAN_DIV must be >= 1, 1 means always use slower algorithm (after LIMIT iterations) */
+						  /*  even 1 and 10 for limit means that the slower algorithm is very rarely used so it has almost no impact on execution time with test program */
+#define INTROMEDIAN_LIMIT 10 /* the number of cumulative times the limit must be exceed before we swap to guaranteed (but slower) O(n) approach */
+
+#define cswap(i,j) {elem_type_median t;t=(i);i=(j);j=t;} // example call cswap(r[i], r[minIndex]); WARNING - side effects could be an issue here ! eg cswap(r[i++],r[b])
+
+#define tswap(i,j) {size_t t;t=(i);i=(j);j=t;} // example call tswap(a, b ); WARNING - side effects could be an issue here ! eg tswap(i++,b)
+
+
+inline static void eswap(size_t i,size_t j,elem_type_median *a) /* swap a[i]  and a[j] as a function so avoids issues with side effects when macro arguments are evaluated more than once */
+	{  
+	 cswap(a[i],a[j]); // use swap macro
+	} 
+
+#define elem_type_ss elem_type_median /* set type for smallsort correctly */	
+#include "ya-smallsort.h" // contains small_sort() - this needs to be included after cswap is defined
+
+/* 
+ 	yaselect()
+ 	==========
+ 	
+ Based on the algorithm in "Fast Deterministic Selection" by Andrei Alexandrescu, 16th International Symposium on Experimental Algorithms (SEA 2017) 
+
+ The original algorithm always used sampling so in theory its possible its runtime could become very large with some inputs, thats avoided here by (automatically) turning sampling off when this might happen.
+ This is done using the principle of introselect - see "Introspective sorting and selection algorithms" by D.R.Musser,Software practice and experience, 8:983-993, 1997.
+ Turning off sampling approximatly doubles the run time, so its left on whenever possible.
+ Code should always have a runtime O(n).
+ It also uses O(log2(n)) RAM [ stack ]. In the test program the max recursion depth was 17 without sampling, 13 with sampling for log2(n)=26.
+*/
+
+void select_driver(elem_type_median* r, size_t n, size_t length,int intro_cnt);
+
+/**
+expandPartitionRight(elem_type_median* r, size_t hi, size_t rite)
+Input assumptions:
+
+(a) hi <= rite
+(c) the range r[0 .. hi] contains elements no smaller than r[0]
+
+Output guarantee: same as Hoare partition using r[0] as pivot. Returns the new
+position of the pivot.
+*/
+static inline size_t expandPartitionRight(elem_type_median* r, size_t hi, size_t rite)
+{
+    size_t pivot = 0;
+    elem_type_median r0=r[0];
+    assert(pivot <= hi);
+    assert(hi <= rite);
+    // First loop: spend r[pivot .. hi]
+    for (; pivot < hi; --rite)
+    {
+        if (rite == hi) goto done;
+        if (r[rite] >= r0) continue;
+        ++pivot;
+        assert(r[pivot] >= r0);
+        cswap(r[rite], r[pivot]);
+    }
+    // Second loop: make left and pivot meet
+    for (; rite > pivot; --rite)
+    {
+        if (r[rite] >= r0) continue;
+        while (rite > pivot)
+        {
+            ++pivot;
+            if (r0 <r[pivot])
+            {
+                cswap(r[rite], r[pivot]);
+                break;
+            }
+        }
+    }
+
+done:
+    cswap(r[0], r[pivot]);
+    return pivot;
+}
+
+/**
+expandPartitionLeft(elem_type_median * r, size_t lo, size_t pivot)
+Input assumptions:
+
+(a) lo > 0, lo <= pivot
+(b) the range r[lo .. pivot] already contains elements no greater than r[pivot]
+
+Output guarantee: Same as Hoare partition around r[pivot]. Returns the new
+position of the pivot.
+
+*/
+static inline size_t expandPartitionLeft(elem_type_median * r, size_t lo, size_t pivot)
+{
+    assert(lo > 0 && lo <= pivot);
+    size_t left = 0;
+    const size_t oldPivot = pivot;
+    elem_type_median op=r[oldPivot];
+    for (; lo < pivot; ++left)
+    {
+        if (left == lo) goto done;
+        if (op >= r[left]) continue;
+        --pivot;
+        assert(op >= r[pivot]);
+        cswap(r[left], r[pivot]);
+    }
+    // Second loop: make left and pivot meet
+    for (;; ++left)
+    {
+        if (left == pivot) break;
+        if (op >= r[left]) continue;
+        for (;;)
+        {
+            if (left == pivot) goto done;
+            --pivot;
+            if (r[pivot] < op)
+            {
+                cswap(r[left], r[pivot]);
+                break;
+            }
+        }
+    }
+
+done:
+    cswap(r[oldPivot], r[pivot]);
+    return pivot;
+}
+
+/**
+expandPartition(elem_type_median* r, size_t lo, size_t pivot, size_t hi, size_t length)
+Input assumptions:
+
+(a) lo <= pivot, pivot < hi, hi <= length
+(b) the range r[lo .. pivot] already contains elements no greater than
+r[pivot]
+(c) the range r[pivot .. hi] already contains elements no smaller than
+r[pivot]
+
+Output guarantee: Same as Hoare partition around r[pivot], returning the new
+position of the pivot.
+*/
+static inline size_t expandPartition(elem_type_median* r, size_t lo, size_t pivot, size_t hi, size_t length)
+{
+    assert(lo <= pivot && pivot < hi && hi <= length);
+    --hi;
+    --length;
+    size_t left = 0;
+    elem_type_median rp=r[pivot];
+    for (;; ++left, --length)
+    {
+        for (;; ++left)
+        {
+            if (left == lo)
+                return pivot +
+                    expandPartitionRight(r + pivot, hi - pivot, length - pivot);
+            if (r[left] > rp) break;
+        }
+        for (;; --length)
+        {
+            if (length == hi)
+                return left +
+                    expandPartitionLeft(r + left, lo - left, pivot - left);
+            if (rp >= r[length]) break;
+        }
+        cswap(r[left], r[length]);
+    }
+}
+
+
+/**
+Returns the index of the median of r[a], r[b], and r[c] without writing
+anything.
+*/
+static inline size_t medianIndex(const elem_type_median* r, size_t a, size_t b, size_t c)
+{
+    if (r[a] > r[c]) tswap(a, c);
+    if (r[b] >r[c]) return c;
+    if (r[b] < r[a]) return a;
+    return b;
+}
+
+/**
+Tukey's Ninther: compute the median of r[_1], r[_2], r[_3], then the median of
+r[_4], r[_5], r[_6], then the median of r[_7], r[_8], r[_9], and then swap the
+median of those three medians into r[_5].
+*/
+static inline void ninther(elem_type_median* r, size_t _1, size_t _2, size_t _3, size_t _4, size_t _5,
+    size_t _6, size_t _7, size_t _8, size_t _9)
+{
+    _2 = medianIndex(r, _1, _2, _3);
+    _8 = medianIndex(r, _7, _8, _9);
+    if (r[_2] > r[_8]) tswap(_2, _8);
+    if (r[_4] > r[_6]) tswap(_4, _6);
+    // Here we know that r[_2] and r[_8] are the other two medians and that
+    // r[_2] <= r[_8]. We also know that r[_4] <= r[_6]
+    if (r[_5] < r[_4])
+    {
+        // r[_4] is the median of r[_4], r[_5], r[_6]
+    }
+    else if (r[_5] > r[_6])
+    {
+        // r[_6] is the median of r[_4], r[_5], r[_6]
+        _4 = _6;
+    }
+    else
+    {
+        // Here we know r[_5] is the median of r[_4], r[_5], r[_6]
+        if (r[_5] < r[_2]) 
+			{cswap(r[_5], r[_2]);
+			 return;
+			}
+        if (r[_5] > r[_8]) 
+			{cswap(r[_5], r[_8]);
+			 return;
+			}
+        // This is the only path that returns with no swap
+        return;
+    }
+    // Here we know r[_4] is the median of r[_4], r[_5], r[_6]
+    if (r[_4] < r[_2]) _4 = _2;
+    else if (r[_4] > r[_8]) _4 = _8;
+    cswap(r[_5], r[_4]);
+}
+
+/**
+Median of minima
+*/
+static inline size_t medianOfMinima(elem_type_median *const r, const size_t n, const size_t length,int intro_cnt)
+{
+    assert(length >= 2);
+    assert(n * 4 <= length);
+    assert(n > 0);
+    const size_t subset = n * 2,
+        computeMinOver = (length - subset) / subset;
+    assert(computeMinOver > 0);
+    for (size_t i = 0, j = subset; i < subset; ++i)
+    {
+        const size_t limit = j + computeMinOver;
+        size_t minIndex = j;
+        while (++j < limit)
+            if (r[j] < r[minIndex])
+                minIndex = j;
+        if (r[minIndex] < r[i])
+            cswap(r[i], r[minIndex]);
+        assert(j < length || i + 1 == subset);
+    }
+    select_driver(r, n, subset,intro_cnt);
+    return expandPartition(r, 0, n, subset, length);
+}
+
+/**
+Median of maxima
+*/
+static inline size_t medianOfMaxima(elem_type_median *const r, const size_t n, const size_t length,int intro_cnt)
+{
+    assert(length >= 2);
+    assert(n * 4 >= length * 3 && n < length);
+    const size_t subset = (length - n) * 2,
+        subsetStart = length - subset,
+        computeMaxOver = subsetStart / subset;
+    assert(computeMaxOver > 0);
+    for (size_t i = subsetStart, j = i - subset * computeMaxOver; i < length; ++i)
+    	{
+         const size_t limit = j + computeMaxOver;
+         size_t maxIndex = j;
+         while (++j < limit)
+            if (r[j] > r[maxIndex])
+                maxIndex = j;
+         if (r[maxIndex] > r[i])
+            cswap(r[i], r[maxIndex]);
+         assert(j != 0 || i + 1 == length);
+    	}
+    select_driver(r + subsetStart, length - n, subset,intro_cnt); 
+    return expandPartition(r, subsetStart, n, length, length);
+}
+
+/**
+Partitions r[0 .. length] using a pivot of its own choosing. Attempts to pick a
+pivot that approximates the median. Returns the position of the pivot.
+*/
+static inline size_t medianOfNinthers(elem_type_median *const r, const size_t length,int intro_cnt)
+{
+    assert(length >= 12);
+    size_t frac;
+	if(intro_cnt>=INTROMEDIAN_LIMIT)
+		{ 
+         frac =(length)/9; // no sampling [ /9 is smallest possible divisor see calculation of gap below  - this is because the ninther algorithm works on blocks of 9 ]
+    	}
+    else
+	   {	
+        /* algorithm is linear for any value of 0< f <=1 but it impacts speed of execution. frac = length*f [multiplication is key for proving o(n) ] */
+	    /* steps below are adaquate for 32 bit addresses, with 64 bit addresses its likley another "step" in the calculation of frac will be helpful to minimise the runtime */
+		if(length<=1024) frac=(length)/12; // 2^10 f=1/12 
+		else if(length<=128*1024) frac=(length)/64; // 128*1024=2^17 => f=1/64 - use power of 2 for efficient divide
+		else if(length<=64*1024*1024 ) frac=(length)/1024; // 64*1024*1024 =2^26 f=1/1024 - use power of 2 for efficient divide
+		else frac= (length)/(64*1024);// f=1/(64*1024)=1/65536 - use power of 2 for efficient divide
+	   }
+     
+    size_t pivot = frac / 2; // (frac-1)/2 is significantly slower here (eg 5.229 secs vs 4.886 secs) - but note frac is length/z so this is different to a normal pivot calculation 
+    const size_t lo = (length) / 2 - pivot, hi = lo + frac; // (length-1)/2 makes odd slightly faster, but even a little slower
+    assert(lo >= frac * 4);
+    assert(length - hi >= frac * 4);
+    assert(lo / 2 >= pivot);
+    const size_t gap = (length - 9 * frac) / 4;
+    size_t a = lo - 4 * frac - gap, b = hi + gap;
+    for (size_t i = lo; i < hi; ++i, a += 3, b += 3)
+    	{
+         ninther(r, a, i - frac, b, a + 1, i, b + 1, a + 2, i + frac, b + 2);
+    	}
+
+    select_driver(r + lo, pivot, frac,intro_cnt);
+    return expandPartition(r, lo, lo + pivot, hi, length);
+}
+
+/**
+
+driver for medianOfNinthers, medianOfMinima, and medianOfMaxima.
+Dispathes to each depending on the relationship between n (the sought order statistics) and length.
+n=0 "returns" the min, n=length-1 "returns" the max.
+In general it places the nth element of the array r (which has length elements) in the position it would be if the array was actually sorted.
+The algoritms also partitions the array r so that all elements to the left of the nth element are <= to those on its right
+It does this in linear time o(length) by using the principle of introselect - see "Introspective sorting and selection algorithms" by D.R.Musser,Software practice and experience, 8:983-993, 1997.
+Note this function is called recursively, from  medianOfNinthers(), medianOfMaxima() and medianOfMinima() so there is some extra RAM (stack) usage of O(log2(length)) (with a small multiplier) .
+
+The first, second, one before last and last elements are special optimised cases, and for short lengths a sort is used as thats more efficient than continueing with the main algorithm. 
+*/
+
+void select_driver(elem_type_median* r, size_t n, size_t length,int intro_cnt)
+{
+  // intro_cnt has to be passed as a function argument as this code is called recursively. If must be set to 0 on the first call 
+  size_t lastlen=0;
+  assert(n < length);
+  while(1) /* loop till done */
+       {
+        // Decide strategy for partitioning
+		/* treat looking for min as special case */
+	    if (n == 0)
+	        {
+             size_t imin = 0;// v is min value
+             elem_type_median v=r[0];
+             // printf("select_driver - min, length=%.0f\n",(double)n);
+             for (size_t i=1; i < length; ++i)
+                if (r[i] < v ) 
+					{imin = i;
+					 v=r[i];
+					}
+             cswap(r[0],r[imin]); 
+             return;// all done
+	        }
+	  
+	    /* treat looking for 2nd smallest min as special case */
+	    if (n == 1 && length>2)
+           {elem_type_median xmin=r[0],xmin1=r[1];// xin/imin = min; xmin1/imin1=2nd lowest
+            size_t imin,imin1;
+            if(xmin1<xmin)
+            	{xmin=r[1]; // swap as x[1] is lowest
+            	 xmin1=r[0];
+            	 imin=1;
+            	 imin1=0;
+            	}
+             else
+             	{imin=0;
+             	 imin1=1;
+             	}
+			for (size_t i=2; i < length; ++i)
+				{
+                 if (r[i] < xmin1 ) 
+					{if(r[i]<xmin)
+						{// new smallest
+						 xmin1=xmin;// old smallest is now 2nd smallest
+						 imin1=imin;
+						 xmin=r[i]; // new smallest
+						 imin=i;
+						}
+					 else
+					 	{// new 2nd smallest
+					 	 xmin1=r[i];
+					 	 imin1=i;
+					 	}
+					}
+				}
+		    // printf("select_driver - min2, length=%.0f imin=%.0f xmin=%.0f imin1=%.0f xmin1=%.0f\n",(double)n,(double)imin,(double)xmin,(double)imin1,(double)xmin1);	
+			/* put min at x[0] and min1 at x[1] - as we have to use swaps watch out for special cases ! */
+			if(imin1==0)
+				{cswap(r[0],r[1]);
+				 if(imin!=1) cswap(r[0],r[imin]);
+				}	
+			 else
+			 	{cswap(r[0],r[imin]);// partition guarantees items to the left are lower, so ensure this 		
+                 cswap(r[1],r[imin1]); 
+			 	}
+        
+            return;// all done
+           }
+	
+	    /* treat looking for max as special case */
+	    if (n + 1 == length)
+	        {
+             size_t imax = 0;
+             elem_type_median v=r[0]; // value at imax;
+             // printf("select_driver - max, length=%.0f\n",(double)n);
+             for (size_t i = 1; i < length; ++i)
+                if ( r[i] > v) 
+					{imax = i;
+					 v=r[i];
+					}
+             cswap(r[imax],r[n]); 
+             return;// all done
+	        }
+	    
+	    /* treat looking for 2nd highest as special case */
+	    if (n + 2 == length && length>2)
+           {elem_type_median xmax=r[0],xmax1=r[1];// xmax/imax = max; xmax1/imax1=2nd highest
+            size_t imax,imax1;
+            if(xmax1>xmax)
+            	{xmax=r[1]; // swap as x[1] is highest
+            	 xmax1=r[0];
+            	 imax=1;
+            	 imax1=0;
+            	}
+             else
+             	{imax=0;
+             	 imax1=1;
+             	}
+            
+            for (size_t i=2; i < length; ++i)
+            	{
+                 if (r[i] > xmax1 ) 
+					{if(r[i]>xmax)
+						{// new largest
+						 xmax1=xmax;// old largest is now 2nd largest
+						 imax1=imax;
+						 xmax=r[i]; // new largest
+						 imax=i;
+						}
+					 else
+					 	{// new 2nd largest
+					 	 xmax1=r[i];
+					 	 imax1=i;
+					 	}
+					}
+				}
+			// printf("select_driver - max2,  length=%.0f imax=%.0f xmax=%.0f imax1=%.0f xmax1=%.0f\n",(double)n,(double)imax,(double)xmax,(double)imax1,(double)xmax1);
+			// put max and max1 in their correct place in the array (max @ far end, max 1 just to its left )
+			if(imax1==n+1)
+				{cswap(r[n+1],r[n]);
+				 if(imax!=n) cswap(r[n+1],r[imax]);
+				}
+			 else
+			 	{
+				 cswap(r[n+1],r[imax]); // items higher must be to the right		
+            	 cswap(r[n],r[imax1]); 
+            	}
+            return;// all done
+           }        
+   		if(length<=16) // for small lengths its faster to use sorting . Sorting does a little more work that strictly necessary but thats typically only 1 extra compare/swap which is trivial overall.
+			{
+			 small_sort(r,length);
+			 return ; // all done
+			}
+   		if(intro_cnt<INTROMEDIAN_LIMIT)
+			{if( lastlen!=0 && length >= (lastlen-lastlen/INTROMEDIAN_DIV)  )
+				++intro_cnt; // reduction in n is not enough for specified geometric progression
+		 	 else if(intro_cnt > -INTROMEDIAN_LIMIT) 
+		 		--intro_cnt; // we are within specified geometric progression limits (if() test stops it becoming a very negative number which would then be slow to respond to a "bad patch"		
+   		 	 lastlen=length;
+   			}
+        assert(n < length);
+        size_t pivot;
+		 /* code that avoids overflows with big n */
+        if (n <= length/6)
+            pivot = medianOfMinima(r, n, length,intro_cnt);
+        else if (n  >= length-length/6)
+            pivot = medianOfMaxima(r, n, length,intro_cnt);
+        else
+        	{    			 
+             pivot = medianOfNinthers(r, length,intro_cnt);
+			}
+        // See how the pivot fares
+        if (pivot == n)
+        	{
+             return;
+        	}
+        if (pivot > n)
+        	{
+             length = pivot;
+        	}
+        else
+        	{
+             ++pivot;
+             r += pivot;
+             length -= pivot;
+             n -= pivot;
+        	}
+       }
+}
+
+elem_type_median yaMedian(elem_type_median *a, size_t s) // return median of array a using above code 
+{
+ select_driver(a,(s-1)/2,s,0);// 0 for normal use ; INTROMEDIAN_LIMIT to always use non-sampling method
+ return a[(s-1)/2];
+}
+
+void yaselect(elem_type_median* r, size_t n, size_t length) // select or nth_element . Places the n th element of a sequence in the position that it would be if the array was sorted. Changes array r. 
+{
+ select_driver(r, n, length,0); // select or nth_element . Places the n th element of a sequence in the position that it would be if the array was sorted. Changes array r. intro_cnt should be 0 on call.
+}
+
+/*   
+   This sort uses yaMedian() to create a simple and reasonably efficient quicksort with guaranteed  O(n*log(n)) execution time as the pivot value is always the median.
+   Code by Peter Miller.
+   This is ~ 50% slower than yasort on average
+   Note yaMedian uses introselect functionality so its worse case is abut 2* its average speed. 
+*/
+void ya_msort(elem_type_median *a,size_t n)
+{
+ elem_type_median m;
+ size_t p;
+ while(1)
+ 	{
+	 if(n<2) return;
+	 if(n<=32)
+	 	{small_sort(a,n); // this is faster than continueing the main algorithm all the way down to very small partitions
+	 	 return;
+	 	}
+	 // next try an insertion sort, abort this if its taking too much effort
+	 // this efficiently sorts arrays that are almost perfectly sorted already.
+	 // using this gives an average 33% speedup using the test program when MAX_INS_MOVES=2
+	 // There is also  a potentially useful side effect of this, if we have to move to using quicksort then 2 values will have been moved which could help break up bad patterns of data.
+#define MAX_INS_MOVES 2 /* max allowed number of moves - for the test program 2 is the optimum value */
+   	 size_t nos_ins_moves=0;	 
+   	 for (elem_type_median  *p =  &(a[1]); p<&(a[n]); ++p) 
+		{
+         elem_type_median  *q = p;
+         elem_type_median *q_1 = p-1;
+		 // Compare first so we can avoid 2 moves for an element already positioned correctly.
+         if (*q< *q_1) 
+		 	{if(++nos_ins_moves>MAX_INS_MOVES) goto do_qsort; // too many moves - swap to qsort
+           	 elem_type_median t = *q;
+           	 do 
+			 	{ q--;
+				}
+         	 	while (q != a && t < *--q_1); 
+         	 memmove(q+1,q,(p-q)*sizeof(elem_type_median));// move a portion of array x right by 1 to make space for t 
+           	 *q = t; // insert t in its correct place in array x
+        	}     	
+        }	
+   	 assert(check_sort( a, n) ); // check result of sort is ordered correctly
+   	 return ; // all done
+do_qsort: 	 	
+	 m=yaMedian(a,n);// get median, as a "side effect" also puts values <= the median to the left of the array and values >= the meadian on the right. 
+	 p=(n-1)/2; // position of median
+#ifdef DEBUG 
+	 // check that elements in a before median are <= median
+	 for(size_t i=0;i<p;++i)
+	 	if(a[i]>m) printf("msort(a,%.0f) a[%.0f]=%.0f expected <= median(%.0f)\n",(double)n,(double)i,(double)a[i],(double)m);
+	 // check that elements in a after median are >= median
+	 for(size_t i=p;i<n;++i)
+	 	if(a[i]<m) printf("msort(a,%.0f) a[%.0f]=%.0f expected >= median(%.0f)\n",(double)n,(double)i,(double)a[i],(double)m); 	
+#endif
+	 while(p>1 && a[p-1]==m) p--; // scan down until we find a value thats != median (values that are = to median are already sorted)
+	 ya_msort(a,p); // recursion - but depth max log2(n) as we split input at true median 
+	 p=(n-1)/2;// back to known position of median
+	 while(p<n-2 && a[p+1]==m) p++;// scan up until we find a value thats != median (values that are = to median are already sorted)
+	 a+=p;// ready for next iteration; removes tail recursion - ya_msort(a+p,n-p)
+	 n-=p; 
+	}	
+}
+
diff --git a/ya-smallsort.h b/ya-smallsort.h
new file mode 100644
index 0000000..564d507
--- /dev/null
+++ b/ya-smallsort.h
@@ -0,0 +1,469 @@
+/* ya_smallsort.h - this is part of yasort and yamedian */
+/* warnings
+		- uses define Z (undefines it at the end)
+		- assumes elem_type_ss and cswap() [eswap2 for small_sort2()]  are already defined 
+*/		   
+#ifndef __YA_SMALLSORT_H
+ #define __YA_SMALLSORT_H
+
+#ifdef Z
+ #error "Error macro Z already defined!"
+#endif
+/*  small_sort() - used for small arrays
+*/
+/*  optimal sorts for small array sizes will sort a max size of array of 32 ! */
+/*  This is a little faster than using an optimised insertion sort (~ 1.4%) using the test program */
+/*  small_sort2() is ~ 6% faster than using an insertion sort based on its test program */
+#ifdef _SORT2 /* 2 array version required */
+static void small_sort2( elem_type_ss *x, elem_type_ss *y,const size_t n)
+ {
+ #define Z(i,j) if(x[i] > x[j]) eswap2(i,j,x,y)
+#else /* single array version */
+static void small_sort( elem_type_ss *x, const size_t n)
+ {
+ #define Z(i,j) if(x[i] > x[j]) cswap(x[i],x[j])
+ // #define Z(i,j) if(x[i] > x[j]) eswap(i,j,x)
+#endif	 
+  assert(n<=32);
+  switch(n)
+   {// use optimal sorts for small sizes (2 to 32), each optimal sort terminates in a return statement
+	case 2:
+	 {  /* Two elements only - can trivially sort these */
+	  Z(0,1);
+	  return ;
+	 }	 
+	case 3:
+	 {  /* Three elements only - can sort these inline from "Programming classics" page 162 section 6.1.2 Sort-3 */
+		Z(0,1);   // X1, X2
+		Z(0,2);     // X1, X3
+		Z(1,2);   // X2,X3
+	  return ;
+	 }
+	case 4:
+	 {  /* four elements only - can sort these inline from "Programming classics" page 162 section 6.1.2 Sort-4 */
+		Z(0,1);   // X1, X2
+		Z(2,3);     // X3, X4
+		Z(0,2);     // X1, X3
+		Z(1,3);   // X2,X4
+		Z(1,2);   // X2,X3
+	  return ;
+	 }
+	case 5:
+	{ /* five elements only - can sort these inline from "Programming classics" page 162 section 6.1.2 Sort-5 */
+	  /* sort3(X1,X2,X3) */
+		Z(0,1);   // X1, X2
+		Z(0,2);     // X1, X3
+		Z(1,2);   // X2,X3
+	  // calls to sort(2)
+		Z(3,4);   // X4, X5
+		Z(0,3);   // X1 , X4
+		Z(2,3);   // X3, X4
+		Z(1,4);   // X2, X5
+		Z(1,2);   // X2, X3
+		Z(3,4);   // X4, X5
+	  return;
+	}
+	case 6:
+	{ /* six elements only - can sort these inline from "Programming classics" page 162 section 6.1.2 Sort-6 */
+	  /* sort3(X1,X2,X3) */
+		Z(0,1);   // X1, X2
+		Z(0,2);     // X1, X3
+		Z(1,2);   // X2,X3
+	  /* sort3(X4,X5,X6) */
+		Z(3,4);   // X4, X5
+		Z(3,5);     // X4, X6
+		Z(4,5);   // X5,X6
+	  // calls to sort(2)
+		Z(0,3);   // X1 , X4
+		Z(2,5);   // X3, X6
+		Z(2,3);   // X3, X4
+		Z(1,4);   // X2, X5
+		Z(1,2);   // X2, X3
+		Z(3,4);   // X4, X5
+	  return;
+	}
+	case 7: // sort7() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{ 	Z(0,6);Z(2,3);Z(4,5);
+		Z(0,2);Z(1,4);Z(3,6);
+		Z(0,1);Z(2,5);Z(3,4);
+		Z(1,2);Z(4,6);
+		Z(2,3);Z(4,5);
+		Z(1,2);Z(3,4);Z(5,6);
+	  return;
+	}
+	case 8: // sort8() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{ 	Z(0,2);Z(1,3);Z(4,6);Z(5,7);
+		Z(0,4);Z(1,5);Z(2,6);Z(3,7);
+		Z(0,1);Z(2,3);Z(4,5);Z(6,7);
+		Z(2,4);Z(3,5);
+		Z(1,4);Z(3,6);
+		Z(1,2);Z(3,4);Z(5,6);
+	  return;
+	}
+	case 9: // sort9() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{ 	Z(0,3);Z(1,7);Z(2,5);Z(4,8);
+		Z(0,7);Z(2,4);Z(3,8);Z(5,6);
+		Z(0,2);Z(1,3);Z(4,5);Z(7,8);
+		Z(1,4);Z(3,6);Z(5,7);
+		Z(0,1);Z(2,4);Z(3,5);Z(6,8);
+		Z(2,3);Z(4,5);Z(6,7);
+		Z(1,2);Z(3,4);Z(5,6);
+	  return;
+	}
+	case 10: // sort10() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{   Z(0,8);Z(1,9);Z(2,7);Z(3,5);Z(4,6);
+		Z(0,2);Z(1,4);Z(5,8);Z(7,9);
+		Z(0,3);Z(2,4);Z(5,7);Z(6,9);
+		Z(0,1);Z(3,6);Z(8,9);
+		Z(1,5);Z(2,3);Z(4,8);Z(6,7);
+		Z(1,2);Z(3,5);Z(4,6);Z(7,8);
+		Z(2,3);Z(4,5);Z(6,7);
+		Z(3,4);Z(5,6);
+	  return;
+	}
+	case 11: // sort11() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{  	Z(0,9);Z(1,6);Z(2,4);Z(3,7);Z(5,8);
+		Z(0,1);Z(3,5);Z(4,10);Z(6,9);Z(7,8);
+		Z(1,3);Z(2,5);Z(4,7);Z(8,10);
+		Z(0,4);Z(1,2);Z(3,7);Z(5,9);Z(6,8);
+		Z(0,1);Z(2,6);Z(4,5);Z(7,8);Z(9,10);
+		Z(2,4);Z(3,6);Z(5,7);Z(8,9);
+		Z(1,2);Z(3,4);Z(5,6);Z(7,8);
+		Z(2,3);Z(4,5);Z(6,7);
+	  return;
+	}
+	case 12: // sort12() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{   Z(0,8);Z(1,7);Z(2,6);Z(3,11);Z(4,10);Z(5,9);
+		Z(0,1);Z(2,5);Z(3,4);Z(6,9);Z(7,8);Z(10,11);
+		Z(0,2);Z(1,6);Z(5,10);Z(9,11);
+		Z(0,3);Z(1,2);Z(4,6);Z(5,7);Z(8,11);Z(9,10);
+		Z(1,4);Z(3,5);Z(6,8);Z(7,10);
+		Z(1,3);Z(2,5);Z(6,9);Z(8,10);
+		Z(2,3);Z(4,5);Z(6,7);Z(8,9);
+		Z(4,6);Z(5,7);
+		Z(3,4);Z(5,6);Z(7,8);
+	  return;
+	}
+	case 13: // sort13() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{   Z(0,12);Z(1,10);Z(2,9);Z(3,7);Z(5,11);Z(6,8);
+		Z(1,6);Z(2,3);Z(4,11);Z(7,9);Z(8,10);
+		Z(0,4);Z(1,2);Z(3,6);Z(7,8);Z(9,10);Z(11,12);
+		Z(4,6);Z(5,9);Z(8,11);Z(10,12);
+		Z(0,5);Z(3,8);Z(4,7);Z(6,11);Z(9,10);
+		Z(0,1);Z(2,5);Z(6,9);Z(7,8);Z(10,11);
+		Z(1,3);Z(2,4);Z(5,6);Z(9,10);
+		Z(1,2);Z(3,4);Z(5,7);Z(6,8);
+		Z(2,3);Z(4,5);Z(6,7);Z(8,9);
+		Z(3,4);Z(5,6);
+	  return;
+	}
+	case 14: // sort14() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{   Z(0,6);Z(1,11);Z(2,12);Z(3,10);Z(4,5);Z(7,13);Z(8,9);
+		Z(1,2);Z(3,7);Z(4,8);Z(5,9);Z(6,10);Z(11,12);
+		Z(0,4);Z(1,3);Z(5,6);Z(7,8);Z(9,13);Z(10,12);
+		Z(0,1);Z(2,9);Z(3,7);Z(4,11);Z(6,10);Z(12,13);
+		Z(2,5);Z(4,7);Z(6,9);Z(8,11);
+		Z(1,2);Z(3,4);Z(6,7);Z(9,10);Z(11,12);
+		Z(1,3);Z(2,4);Z(5,6);Z(7,8);Z(9,11);Z(10,12);
+		Z(2,3);Z(4,7);Z(6,9);Z(10,11);
+		Z(4,5);Z(6,7);Z(8,9);
+		Z(3,4);Z(5,6);Z(7,8);Z(9,10);
+	  return;
+	}
+	case 15: // sort15() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{   Z(1,2);Z(3,10);Z(4,14);Z(5,8);Z(6,13);Z(7,12);Z(9,11);
+		Z(0,14);Z(1,5);Z(2,8);Z(3,7);Z(6,9);Z(10,12);Z(11,13);
+		Z(0,7);Z(1,6);Z(2,9);Z(4,10);Z(5,11);Z(8,13);Z(12,14);
+		Z(0,6);Z(2,4);Z(3,5);Z(7,11);Z(8,10);Z(9,12);Z(13,14);
+		Z(0,3);Z(1,2);Z(4,7);Z(5,9);Z(6,8);Z(10,11);Z(12,13);
+		Z(0,1);Z(2,3);Z(4,6);Z(7,9);Z(10,12);Z(11,13);
+		Z(1,2);Z(3,5);Z(8,10);Z(11,12);
+		Z(3,4);Z(5,6);Z(7,8);Z(9,10);
+		Z(2,3);Z(4,5);Z(6,7);Z(8,9);Z(10,11);
+		Z(5,6);Z(7,8);
+	  return;
+	}
+	case 16: // sort16() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{   Z(0,13);Z(1,12);Z(2,15);Z(3,14);Z(4,8);Z(5,6);Z(7,11);Z(9,10);
+		Z(0,5);Z(1,7);Z(2,9);Z(3,4);Z(6,13);Z(8,14);Z(10,15);Z(11,12);
+		Z(0,1);Z(2,3);Z(4,5);Z(6,8);Z(7,9);Z(10,11);Z(12,13);Z(14,15);
+		Z(0,2);Z(1,3);Z(4,10);Z(5,11);Z(6,7);Z(8,9);Z(12,14);Z(13,15);
+		Z(1,2);Z(3,12);Z(4,6);Z(5,7);Z(8,10);Z(9,11);Z(13,14);
+		Z(1,4);Z(2,6);Z(5,8);Z(7,10);Z(9,13);Z(11,14);
+		Z(2,4);Z(3,6);Z(9,12);Z(11,13);
+		Z(3,5);Z(6,8);Z(7,9);Z(10,12);
+		Z(3,4);Z(5,6);Z(7,8);Z(9,10);Z(11,12);
+		Z(6,7);Z(8,9);
+	  return;
+	}
+	case 17: // sort17() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{   Z(0,11);Z(1,15);Z(2,10);Z(3,5);Z(4,6);Z(8,12);Z(9,16);Z(13,14);
+		Z(0,6);Z(1,13);Z(2,8);Z(4,14);Z(5,15);Z(7,11);
+		Z(0,8);Z(3,7);Z(4,9);Z(6,16);Z(10,11);Z(12,14);
+		Z(0,2);Z(1,4);Z(5,6);Z(7,13);Z(8,9);Z(10,12);Z(11,14);Z(15,16);
+		Z(0,3);Z(2,5);Z(6,11);Z(7,10);Z(9,13);Z(12,15);Z(14,16);
+		Z(0,1);Z(3,4);Z(5,10);Z(6,9);Z(7,8);Z(11,15);Z(13,14);
+		Z(1,2);Z(3,7);Z(4,8);Z(6,12);Z(11,13);Z(14,15);
+		Z(1,3);Z(2,7);Z(4,5);Z(9,11);Z(10,12);Z(13,14);
+		Z(2,3);Z(4,6);Z(5,7);Z(8,10);
+		Z(3,4);Z(6,8);Z(7,9);Z(10,12);
+		Z(5,6);Z(7,8);Z(9,10);Z(11,12);
+		Z(4,5);Z(6,7);Z(8,9);Z(10,11);Z(12,13);
+	  return;
+	}
+	case 18: // sort18() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{   Z(0,1);Z(2,3);Z(4,5);Z(6,7);Z(8,9);Z(10,11);Z(12,13);Z(14,15);Z(16,17);
+		Z(1,5);Z(2,6);Z(3,7);Z(4,10);Z(8,16);Z(9,17);Z(12,14);Z(13,15);
+		Z(0,8);Z(1,10);Z(2,12);Z(3,14);Z(6,13);Z(7,15);Z(9,16);Z(11,17);
+		Z(0,4);Z(1,9);Z(5,17);Z(8,11);Z(10,16);
+		Z(0,2);Z(1,6);Z(4,10);Z(5,9);Z(14,16);Z(15,17);
+		Z(1,2);Z(3,10);Z(4,12);Z(5,7);Z(6,14);Z(9,13);Z(15,16);
+		Z(3,8);Z(5,12);Z(7,11);Z(9,10);
+		Z(3,4);Z(6,8);Z(7,14);Z(9,12);Z(11,13);
+		Z(1,3);Z(2,4);Z(7,9);Z(8,12);Z(11,15);Z(13,16);
+		Z(2,3);Z(4,5);Z(6,7);Z(10,11);Z(12,14);Z(13,15);
+		Z(4,6);Z(5,8);Z(9,10);Z(11,14);
+		Z(3,4);Z(5,7);Z(8,9);Z(10,12);Z(13,14);
+		Z(5,6);Z(7,8);Z(9,10);Z(11,12);
+	  return;
+	}
+	case 19: // sort19() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{   Z(0,12);Z(1,4);Z(2,8);Z(3,5);Z(6,17);Z(7,11);Z(9,14);Z(10,13);Z(15,16);
+		Z(0,2);Z(1,7);Z(3,6);Z(4,11);Z(5,17);Z(8,12);Z(10,15);Z(13,16);Z(14,18);
+		Z(3,10);Z(4,14);Z(5,15);Z(6,13);Z(7,9);Z(11,17);Z(16,18);
+		Z(0,7);Z(1,10);Z(4,6);Z(9,15);Z(11,16);Z(12,17);Z(13,14);
+		Z(0,3);Z(2,6);Z(5,7);Z(8,11);Z(12,16);
+		Z(1,8);Z(2,9);Z(3,4);Z(6,15);Z(7,13);Z(10,11);Z(12,18);
+		Z(1,3);Z(2,5);Z(6,9);Z(7,12);Z(8,10);Z(11,14);Z(17,18);
+		Z(0,1);Z(2,3);Z(4,8);Z(6,10);Z(9,12);Z(14,15);Z(16,17);
+		Z(1,2);Z(5,8);Z(6,7);Z(9,11);Z(10,13);Z(14,16);Z(15,17);
+		Z(3,6);Z(4,5);Z(7,9);Z(8,10);Z(11,12);Z(13,14);Z(15,16);
+		Z(3,4);Z(5,6);Z(7,8);Z(9,10);Z(11,13);Z(12,14);
+		Z(2,3);Z(4,5);Z(6,7);Z(8,9);Z(10,11);Z(12,13);Z(14,15);
+	  return;
+	}
+	case 20: // sort20() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{   Z(0,3);Z(1,7);Z(2,5);Z(4,8);Z(6,9);Z(10,13);Z(11,15);Z(12,18);Z(14,17);Z(16,19);
+		Z(0,14);Z(1,11);Z(2,16);Z(3,17);Z(4,12);Z(5,19);Z(6,10);Z(7,15);Z(8,18);Z(9,13);
+		Z(0,4);Z(1,2);Z(3,8);Z(5,7);Z(11,16);Z(12,14);Z(15,19);Z(17,18);
+		Z(1,6);Z(2,12);Z(3,5);Z(4,11);Z(7,17);Z(8,15);Z(13,18);Z(14,16);
+		Z(0,1);Z(2,6);Z(7,10);Z(9,12);Z(13,17);Z(18,19);
+		Z(1,6);Z(5,9);Z(7,11);Z(8,12);Z(10,14);Z(13,18);
+		Z(3,5);Z(4,7);Z(8,10);Z(9,11);Z(12,15);Z(14,16);
+		Z(1,3);Z(2,4);Z(5,7);Z(6,10);Z(9,13);Z(12,14);Z(15,17);Z(16,18);
+		Z(1,2);Z(3,4);Z(6,7);Z(8,9);Z(10,11);Z(12,13);Z(15,16);Z(17,18);
+		Z(2,3);Z(4,6);Z(5,8);Z(7,9);Z(10,12);Z(11,14);Z(13,15);Z(16,17);
+		Z(4,5);Z(6,8);Z(7,10);Z(9,12);Z(11,13);Z(14,15);
+		Z(3,4);Z(5,6);Z(7,8);Z(9,10);Z(11,12);Z(13,14);Z(15,16);
+	  return;
+	}
+	case 21: // sort21() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{   Z(0,7);Z(1,10);Z(3,5);Z(4,8);Z(6,13);Z(9,19);Z(11,14);Z(12,17);Z(15,16);Z(18,20);
+		Z(0,11);Z(1,15);Z(2,12);Z(3,4);Z(5,8);Z(6,9);Z(7,14);Z(10,16);Z(13,19);Z(17,20);
+		Z(0,6);Z(1,3);Z(2,18);Z(4,15);Z(5,10);Z(8,16);Z(11,17);Z(12,13);Z(14,20);
+		Z(2,6);Z(5,12);Z(7,18);Z(8,14);Z(9,11);Z(10,17);Z(13,19);Z(16,20);
+		Z(1,2);Z(4,7);Z(5,9);Z(6,17);Z(10,13);Z(11,12);Z(14,19);Z(15,18);
+		Z(0,2);Z(3,6);Z(4,5);Z(7,10);Z(8,11);Z(9,15);Z(12,16);Z(13,18);Z(14,17);Z(19,20);
+		Z(0,1);Z(2,3);Z(5,9);Z(6,12);Z(7,8);Z(11,14);Z(13,15);Z(16,19);Z(17,18);
+		Z(1,2);Z(3,9);Z(6,13);Z(10,11);Z(12,15);Z(16,17);Z(18,19);
+		Z(1,4);Z(2,5);Z(3,7);Z(6,10);Z(8,9);Z(11,12);Z(13,14);Z(17,18);
+		Z(2,4);Z(5,6);Z(7,8);Z(9,11);Z(10,13);Z(12,15);Z(14,16);
+		Z(3,4);Z(5,7);Z(6,8);Z(9,10);Z(11,13);Z(12,14);Z(15,16);
+		Z(4,5);Z(6,7);Z(8,9);Z(10,11);Z(12,13);Z(14,15);Z(16,17);
+	  return;
+	}
+	case 22: // sort22() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{   Z(0,1);Z(2,3);Z(4,5);Z(6,7);Z(8,9);Z(10,11);Z(12,13);Z(14,15);Z(16,17);Z(18,19);Z(20,21);
+		Z(0,12);Z(1,13);Z(2,6);Z(3,7);Z(4,10);Z(8,20);Z(9,21);Z(11,17);Z(14,18);Z(15,19);
+		Z(0,2);Z(1,6);Z(3,12);Z(4,16);Z(5,17);Z(7,13);Z(8,14);Z(9,18);Z(15,20);Z(19,21);
+		Z(0,8);Z(1,15);Z(2,14);Z(3,9);Z(5,11);Z(6,20);Z(7,19);Z(10,16);Z(12,18);Z(13,21);
+		Z(0,4);Z(1,10);Z(3,8);Z(5,9);Z(7,14);Z(11,20);Z(12,16);Z(13,18);Z(17,21);
+		Z(1,3);Z(2,5);Z(4,8);Z(6,9);Z(7,10);Z(11,14);Z(12,15);Z(13,17);Z(16,19);Z(18,20);
+		Z(2,4);Z(3,12);Z(5,8);Z(6,11);Z(9,18);Z(10,15);Z(13,16);Z(17,19);
+		Z(1,2);Z(3,4);Z(5,7);Z(6,12);Z(8,11);Z(9,15);Z(10,13);Z(14,16);Z(17,18);Z(19,20);
+		Z(2,3);Z(4,5);Z(7,12);Z(8,10);Z(9,14);Z(11,13);Z(16,17);Z(18,19);
+		Z(4,6);Z(5,8);Z(9,11);Z(10,12);Z(13,16);Z(15,17);
+		Z(3,4);Z(6,7);Z(9,10);Z(11,12);Z(14,15);Z(17,18);
+		Z(5,6);Z(7,8);Z(10,11);Z(13,14);Z(15,16);
+		Z(6,7);Z(8,9);Z(12,13);Z(14,15);
+	  return;
+	}
+	case 23: // sort23() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{   Z(0,20);Z(1,12);Z(2,16);Z(4,6);Z(5,10);Z(7,21);Z(8,14);Z(9,15);Z(11,22);Z(13,18);Z(17,19);
+		Z(0,3);Z(1,11);Z(2,7);Z(4,17);Z(5,13);Z(6,19);Z(8,9);Z(10,18);Z(12,22);Z(14,15);Z(16,21);
+		Z(0,1);Z(2,4);Z(3,12);Z(5,8);Z(6,9);Z(7,10);Z(11,20);Z(13,16);Z(14,17);Z(15,18);Z(19,21);
+		Z(2,5);Z(4,8);Z(6,11);Z(7,14);Z(9,16);Z(12,17);Z(15,19);Z(18,21);
+		Z(1,8);Z(3,14);Z(4,7);Z(9,20);Z(10,12);Z(11,13);Z(15,22);Z(16,19);
+		Z(0,7);Z(1,5);Z(3,4);Z(6,11);Z(8,15);Z(9,14);Z(10,13);Z(12,17);Z(18,22);Z(19,20);
+		Z(0,2);Z(1,6);Z(4,7);Z(5,9);Z(8,10);Z(13,15);Z(14,18);Z(16,19);Z(17,22);Z(20,21);
+		Z(2,3);Z(4,5);Z(6,8);Z(7,9);Z(10,11);Z(12,13);Z(14,16);Z(15,17);Z(18,19);Z(21,22);
+		Z(1,2);Z(3,6);Z(4,10);Z(7,8);Z(9,11);Z(12,14);Z(13,19);Z(15,16);Z(17,20);
+		Z(2,3);Z(5,10);Z(6,7);Z(8,9);Z(13,18);Z(14,15);Z(16,17);Z(20,21);
+		Z(3,4);Z(5,7);Z(10,12);Z(11,13);Z(16,18);Z(19,20);
+		Z(4,6);Z(8,10);Z(9,12);Z(11,14);Z(13,15);Z(17,19);
+		Z(5,6);Z(7,8);Z(9,10);Z(11,12);Z(13,14);Z(15,16);Z(17,18);
+	  return;
+	}
+	case 24: // sort24() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{   Z(0,20);Z(1,12);Z(2,16);Z(3,23);Z(4,6);Z(5,10);Z(7,21);Z(8,14);Z(9,15);Z(11,22);Z(13,18);Z(17,19);
+		Z(0,3);Z(1,11);Z(2,7);Z(4,17);Z(5,13);Z(6,19);Z(8,9);Z(10,18);Z(12,22);Z(14,15);Z(16,21);Z(20,23);
+		Z(0,1);Z(2,4);Z(3,12);Z(5,8);Z(6,9);Z(7,10);Z(11,20);Z(13,16);Z(14,17);Z(15,18);Z(19,21);Z(22,23);
+		Z(2,5);Z(4,8);Z(6,11);Z(7,14);Z(9,16);Z(12,17);Z(15,19);Z(18,21);
+		Z(1,8);Z(3,14);Z(4,7);Z(9,20);Z(10,12);Z(11,13);Z(15,22);Z(16,19);
+		Z(0,7);Z(1,5);Z(3,4);Z(6,11);Z(8,15);Z(9,14);Z(10,13);Z(12,17);Z(16,23);Z(18,22);Z(19,20);
+		Z(0,2);Z(1,6);Z(4,7);Z(5,9);Z(8,10);Z(13,15);Z(14,18);Z(16,19);Z(17,22);Z(21,23);
+		Z(2,3);Z(4,5);Z(6,8);Z(7,9);Z(10,11);Z(12,13);Z(14,16);Z(15,17);Z(18,19);Z(20,21);
+		Z(1,2);Z(3,6);Z(4,10);Z(7,8);Z(9,11);Z(12,14);Z(13,19);Z(15,16);Z(17,20);Z(21,22);
+		Z(2,3);Z(5,10);Z(6,7);Z(8,9);Z(13,18);Z(14,15);Z(16,17);Z(20,21);
+		Z(3,4);Z(5,7);Z(10,12);Z(11,13);Z(16,18);Z(19,20);
+		Z(4,6);Z(8,10);Z(9,12);Z(11,14);Z(13,15);Z(17,19);
+		Z(5,6);Z(7,8);Z(9,10);Z(11,12);Z(13,14);Z(15,16);Z(17,18);
+	  return;
+	}
+	case 25: // sort25() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{   Z(0,2);Z(1,8);Z(3,18);Z(4,17);Z(5,20);Z(6,19);Z(7,9);Z(10,11);Z(12,13);Z(14,16);Z(15,22);Z(21,23);
+		Z(0,3);Z(1,15);Z(2,18);Z(4,12);Z(5,21);Z(6,10);Z(7,14);Z(8,22);Z(9,16);Z(11,19);Z(13,17);Z(20,23);
+		Z(0,4);Z(1,7);Z(2,13);Z(3,12);Z(5,6);Z(8,14);Z(9,15);Z(10,21);Z(11,20);Z(16,22);Z(17,18);Z(19,23);
+		Z(0,5);Z(2,11);Z(3,6);Z(4,10);Z(7,16);Z(8,9);Z(12,21);Z(13,19);Z(14,15);Z(17,20);Z(18,23);
+		Z(2,7);Z(6,9);Z(8,11);Z(14,24);Z(18,21);
+		Z(3,8);Z(7,10);Z(11,12);Z(13,14);Z(15,21);Z(18,20);Z(22,24);
+		Z(4,13);Z(10,16);Z(11,15);Z(18,24);Z(19,22);
+		Z(1,4);Z(8,11);Z(9,19);Z(13,17);Z(14,18);Z(16,20);Z(23,24);
+		Z(0,1);Z(4,5);Z(6,13);Z(9,14);Z(10,17);Z(12,16);Z(18,19);Z(20,21);Z(22,23);
+		Z(2,6);Z(3,4);Z(5,13);Z(7,9);Z(12,18);Z(15,17);Z(16,19);Z(20,22);Z(21,23);
+		Z(1,2);Z(5,8);Z(6,7);Z(9,10);Z(11,13);Z(14,15);Z(17,20);Z(21,22);
+		Z(1,3);Z(2,4);Z(5,6);Z(7,11);Z(8,9);Z(10,13);Z(12,14);Z(15,16);Z(17,18);Z(19,20);
+		Z(2,3);Z(4,8);Z(6,7);Z(9,12);Z(10,11);Z(13,14);Z(15,17);Z(16,18);Z(20,21);
+		Z(3,5);Z(4,6);Z(7,8);Z(9,10);Z(11,12);Z(13,15);Z(14,17);Z(16,19);
+		Z(4,5);Z(6,7);Z(8,9);Z(10,11);Z(12,13);Z(14,15);Z(16,17);Z(18,19);
+	  return;
+	}
+	case 26: // sort26() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{   Z(0,25);Z(1,3);Z(2,9);Z(4,19);Z(5,18);Z(6,21);Z(7,20);Z(8,10);Z(11,12);Z(13,14);Z(15,17);Z(16,23);Z(22,24);
+		Z(1,4);Z(2,16);Z(3,19);Z(5,13);Z(6,22);Z(7,11);Z(8,15);Z(9,23);Z(10,17);Z(12,20);Z(14,18);Z(21,24);
+		Z(1,5);Z(2,8);Z(3,14);Z(4,13);Z(6,7);Z(9,15);Z(10,16);Z(11,22);Z(12,21);Z(17,23);Z(18,19);Z(20,24);
+		Z(0,10);Z(1,6);Z(3,7);Z(4,11);Z(5,12);Z(13,20);Z(14,21);Z(15,25);Z(18,22);Z(19,24);
+		Z(0,4);Z(8,10);Z(12,13);Z(15,17);Z(21,25);
+		Z(0,2);Z(4,8);Z(10,12);Z(13,15);Z(17,21);Z(23,25);
+		Z(0,1);Z(2,3);Z(4,5);Z(8,14);Z(9,13);Z(11,17);Z(12,16);Z(20,21);Z(22,23);Z(24,25);
+		Z(1,4);Z(3,10);Z(6,9);Z(7,13);Z(8,11);Z(12,18);Z(14,17);Z(15,22);Z(16,19);Z(21,24);
+		Z(2,6);Z(3,8);Z(5,7);Z(9,12);Z(13,16);Z(17,22);Z(18,20);Z(19,23);
+		Z(1,2);Z(4,6);Z(5,9);Z(7,10);Z(11,12);Z(13,14);Z(15,18);Z(16,20);Z(19,21);Z(23,24);
+		Z(2,4);Z(3,5);Z(7,13);Z(8,9);Z(10,14);Z(11,15);Z(12,18);Z(16,17);Z(20,22);Z(21,23);
+		Z(3,4);Z(6,9);Z(7,11);Z(10,12);Z(13,15);Z(14,18);Z(16,19);Z(21,22);
+		Z(5,7);Z(6,8);Z(9,13);Z(10,11);Z(12,16);Z(14,15);Z(17,19);Z(18,20);
+		Z(5,6);Z(7,8);Z(9,10);Z(11,13);Z(12,14);Z(15,16);Z(17,18);Z(19,20);
+		Z(4,5);Z(6,7);Z(8,9);Z(10,11);Z(12,13);Z(14,15);Z(16,17);Z(18,19);Z(20,21);
+	  return;
+	}
+	case 27: // sort27() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{   Z(0,9);Z(1,6);Z(2,4);Z(3,7);Z(5,8);Z(11,24);Z(12,23);Z(13,26);Z(14,25);Z(15,19);Z(16,17);Z(18,22);Z(20,21);
+		Z(0,1);Z(3,5);Z(4,10);Z(6,9);Z(7,8);Z(11,16);Z(12,18);Z(13,20);Z(14,15);Z(17,24);Z(19,25);Z(21,26);Z(22,23);
+		Z(1,3);Z(2,5);Z(4,7);Z(8,10);Z(11,12);Z(13,14);Z(15,16);Z(17,19);Z(18,20);Z(21,22);Z(23,24);Z(25,26);
+		Z(0,4);Z(1,2);Z(3,7);Z(5,9);Z(6,8);Z(11,13);Z(12,14);Z(15,21);Z(16,22);Z(17,18);Z(19,20);Z(23,25);Z(24,26);
+		Z(0,1);Z(2,6);Z(4,5);Z(7,8);Z(9,10);Z(12,13);Z(14,23);Z(15,17);Z(16,18);Z(19,21);Z(20,22);Z(24,25);
+		Z(0,11);Z(2,4);Z(3,6);Z(5,7);Z(8,9);Z(12,15);Z(13,17);Z(16,19);Z(18,21);Z(20,24);Z(22,25);
+		Z(1,2);Z(3,4);Z(5,6);Z(7,8);Z(13,15);Z(14,17);Z(20,23);Z(22,24);
+		Z(1,12);Z(2,3);Z(4,5);Z(6,7);Z(14,16);Z(17,19);Z(18,20);Z(21,23);
+		Z(2,13);Z(14,15);Z(16,17);Z(18,19);Z(20,21);Z(22,23);
+		Z(3,14);Z(4,15);Z(5,16);Z(10,21);Z(17,18);Z(19,20);
+		Z(6,17);Z(7,18);Z(8,19);Z(9,20);Z(10,13);Z(14,22);Z(15,23);Z(16,24);
+		Z(6,10);Z(7,14);Z(8,11);Z(9,12);Z(17,25);Z(18,26);Z(19,23);Z(20,24);
+		Z(4,8);Z(5,9);Z(11,15);Z(12,16);Z(13,17);Z(18,22);Z(21,25);Z(24,26);
+		Z(2,4);Z(3,5);Z(6,8);Z(7,9);Z(10,11);Z(12,14);Z(13,15);Z(16,18);Z(17,19);Z(20,22);Z(21,23);Z(25,26);
+		Z(1,2);Z(3,4);Z(5,6);Z(7,8);Z(9,10);Z(11,12);Z(13,14);Z(15,16);Z(17,18);Z(19,20);Z(21,22);Z(23,24);
+	  return;
+	}
+	case 28: // sort28() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{   Z(0,9);Z(1,20);Z(2,21);Z(3,22);Z(4,19);Z(5,24);Z(6,25);Z(7,26);Z(8,23);Z(10,15);Z(11,13);Z(12,17);Z(14,16);Z(18,27);
+		Z(0,18);Z(1,7);Z(2,6);Z(3,5);Z(4,8);Z(9,27);Z(10,12);Z(11,14);Z(13,16);Z(15,17);Z(19,23);Z(20,26);Z(21,25);Z(22,24);
+		Z(1,2);Z(3,4);Z(5,19);Z(6,20);Z(7,21);Z(8,22);Z(9,18);Z(10,11);Z(12,14);Z(13,15);Z(16,17);Z(23,24);Z(25,26);
+		Z(0,3);Z(1,10);Z(5,8);Z(6,7);Z(11,13);Z(14,16);Z(17,26);Z(19,22);Z(20,21);Z(24,27);
+		Z(0,1);Z(2,7);Z(3,10);Z(4,8);Z(12,13);Z(14,15);Z(17,24);Z(19,23);Z(20,25);Z(26,27);
+		Z(1,3);Z(2,6);Z(4,5);Z(7,19);Z(8,20);Z(11,12);Z(13,14);Z(15,16);Z(21,25);Z(22,23);Z(24,26);
+		Z(2,4);Z(5,12);Z(7,8);Z(9,11);Z(10,14);Z(13,17);Z(15,22);Z(16,18);Z(19,20);Z(23,25);
+		Z(2,9);Z(4,11);Z(5,6);Z(7,13);Z(8,10);Z(14,20);Z(16,23);Z(17,19);Z(18,25);Z(21,22);
+		Z(1,2);Z(3,16);Z(4,9);Z(6,12);Z(10,14);Z(11,24);Z(13,17);Z(15,21);Z(18,23);Z(25,26);
+		Z(2,8);Z(3,5);Z(4,7);Z(6,16);Z(9,15);Z(11,21);Z(12,18);Z(19,25);Z(20,23);Z(22,24);
+		Z(2,3);Z(5,8);Z(7,9);Z(11,15);Z(12,16);Z(18,20);Z(19,22);Z(24,25);
+		Z(6,8);Z(10,12);Z(11,13);Z(14,16);Z(15,17);Z(19,21);
+		Z(5,6);Z(8,10);Z(9,11);Z(12,13);Z(14,15);Z(16,18);Z(17,19);Z(21,22);
+		Z(4,5);Z(6,7);Z(8,9);Z(10,11);Z(12,14);Z(13,15);Z(16,17);Z(18,19);Z(20,21);Z(22,23);
+		Z(3,4);Z(5,6);Z(7,8);Z(9,10);Z(11,12);Z(13,14);Z(15,16);Z(17,18);Z(19,20);Z(21,22);Z(23,24);
+	  return;
+	}
+	case 29: // sort29() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{   Z(0,12);Z(1,10);Z(2,9);Z(3,7);Z(5,11);Z(6,8);Z(13,26);Z(14,25);Z(15,28);Z(16,27);Z(17,21);Z(18,19);Z(20,24);Z(22,23);
+		Z(1,6);Z(2,3);Z(4,11);Z(7,9);Z(8,10);Z(13,18);Z(14,20);Z(15,22);Z(16,17);Z(19,26);Z(21,27);Z(23,28);Z(24,25);
+		Z(0,4);Z(1,2);Z(3,6);Z(7,8);Z(9,10);Z(11,12);Z(13,14);Z(15,16);Z(17,18);Z(19,21);Z(20,22);Z(23,24);Z(25,26);Z(27,28);
+		Z(4,6);Z(5,9);Z(8,11);Z(10,12);Z(13,15);Z(14,16);Z(17,23);Z(18,24);Z(19,20);Z(21,22);Z(25,27);Z(26,28);
+		Z(0,5);Z(3,8);Z(4,7);Z(6,11);Z(9,10);Z(14,15);Z(16,25);Z(17,19);Z(18,20);Z(21,23);Z(22,24);Z(26,27);
+		Z(0,1);Z(2,5);Z(6,9);Z(7,8);Z(10,11);Z(14,17);Z(15,19);Z(18,21);Z(20,23);Z(22,26);Z(24,27);
+		Z(0,13);Z(1,3);Z(2,4);Z(5,6);Z(9,10);Z(15,17);Z(16,19);Z(22,25);Z(24,26);
+		Z(1,2);Z(3,4);Z(5,7);Z(6,8);Z(16,18);Z(19,21);Z(20,22);Z(23,25);
+		Z(1,14);Z(2,3);Z(4,5);Z(6,7);Z(8,9);Z(16,17);Z(18,19);Z(20,21);Z(22,23);Z(24,25);
+		Z(2,15);Z(3,4);Z(5,6);Z(10,23);Z(11,24);Z(12,25);Z(19,20);Z(21,22);
+		Z(3,16);Z(4,17);Z(5,18);Z(6,19);Z(7,20);Z(8,21);Z(9,22);Z(10,15);
+		Z(6,10);Z(8,13);Z(9,14);Z(11,16);Z(12,17);Z(18,26);Z(19,27);Z(20,28);
+		Z(4,8);Z(5,9);Z(7,11);Z(12,13);Z(14,18);Z(15,19);Z(16,20);Z(17,21);Z(22,26);Z(23,27);Z(24,28);
+		Z(2,4);Z(3,5);Z(6,8);Z(7,9);Z(10,12);Z(11,14);Z(13,15);Z(16,18);Z(17,19);Z(20,22);Z(21,23);Z(24,26);Z(25,27);
+		Z(1,2);Z(3,4);Z(5,6);Z(7,8);Z(9,10);Z(11,12);Z(13,14);Z(15,16);Z(17,18);Z(19,20);Z(21,22);Z(23,24);Z(25,26);Z(27,28);
+	  return;
+	}
+	case 30: // sort30() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{   Z(1,2);Z(3,10);Z(4,14);Z(5,8);Z(6,13);Z(7,12);Z(9,11);Z(16,17);Z(18,25);Z(19,29);Z(20,23);Z(21,28);Z(22,27);Z(24,26);
+		Z(0,14);Z(1,5);Z(2,8);Z(3,7);Z(6,9);Z(10,12);Z(11,13);Z(15,29);Z(16,20);Z(17,23);Z(18,22);Z(21,24);Z(25,27);Z(26,28);
+		Z(0,7);Z(1,6);Z(2,9);Z(4,10);Z(5,11);Z(8,13);Z(12,14);Z(15,22);Z(16,21);Z(17,24);Z(19,25);Z(20,26);Z(23,28);Z(27,29);
+		Z(0,6);Z(2,4);Z(3,5);Z(7,11);Z(8,10);Z(9,12);Z(13,14);Z(15,21);Z(17,19);Z(18,20);Z(22,26);Z(23,25);Z(24,27);Z(28,29);
+		Z(0,3);Z(1,2);Z(4,7);Z(5,9);Z(6,8);Z(10,11);Z(12,13);Z(14,29);Z(15,18);Z(16,17);Z(19,22);Z(20,24);Z(21,23);Z(25,26);Z(27,28);
+		Z(0,1);Z(2,3);Z(4,6);Z(7,9);Z(10,12);Z(11,13);Z(15,16);Z(17,18);Z(19,21);Z(22,24);Z(25,27);Z(26,28);
+		Z(0,15);Z(1,2);Z(3,5);Z(8,10);Z(11,12);Z(13,28);Z(16,17);Z(18,20);Z(23,25);Z(26,27);
+		Z(1,16);Z(3,4);Z(5,6);Z(7,8);Z(9,10);Z(12,27);Z(18,19);Z(20,21);Z(22,23);Z(24,25);
+		Z(2,3);Z(4,5);Z(6,7);Z(8,9);Z(10,11);Z(17,18);Z(19,20);Z(21,22);Z(23,24);Z(25,26);
+		Z(2,17);Z(3,18);Z(4,19);Z(5,6);Z(7,8);Z(9,24);Z(10,25);Z(11,26);Z(20,21);Z(22,23);
+		Z(5,20);Z(6,21);Z(7,22);Z(8,23);Z(9,16);Z(10,17);Z(11,18);Z(12,19);
+		Z(5,9);Z(6,10);Z(7,11);Z(8,15);Z(13,20);Z(14,21);Z(18,22);Z(19,23);
+		Z(3,5);Z(4,8);Z(7,9);Z(12,15);Z(13,16);Z(14,17);Z(20,24);Z(21,25);
+		Z(2,4);Z(6,8);Z(10,12);Z(11,13);Z(14,15);Z(16,18);Z(17,19);Z(20,22);Z(21,23);Z(24,26);Z(25,27);
+		Z(1,2);Z(3,4);Z(5,6);Z(7,8);Z(9,10);Z(11,12);Z(13,14);Z(15,16);Z(17,18);Z(19,20);Z(21,22);Z(23,24);Z(25,26);Z(27,28);
+	  return;
+	}
+	case 31: // sort31() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{   Z(0,1);Z(2,3);Z(4,5);Z(6,7);Z(8,9);Z(10,11);Z(12,13);Z(14,15);Z(16,17);Z(18,19);Z(20,21);Z(22,23);Z(24,25);Z(26,27);Z(28,29);
+		Z(0,2);Z(1,3);Z(4,6);Z(5,7);Z(8,10);Z(9,11);Z(12,14);Z(13,15);Z(16,18);Z(17,19);Z(20,22);Z(21,23);Z(24,26);Z(25,27);Z(28,30);
+		Z(0,4);Z(1,5);Z(2,6);Z(3,7);Z(8,12);Z(9,13);Z(10,14);Z(11,15);Z(16,20);Z(17,21);Z(18,22);Z(19,23);Z(24,28);Z(25,29);Z(26,30);
+		Z(0,8);Z(1,9);Z(2,10);Z(3,11);Z(4,12);Z(5,13);Z(6,14);Z(7,15);Z(16,24);Z(17,25);Z(18,26);Z(19,27);Z(20,28);Z(21,29);Z(22,30);
+		Z(0,16);Z(1,8);Z(2,4);Z(3,12);Z(5,10);Z(6,9);Z(7,14);Z(11,13);Z(17,24);Z(18,20);Z(19,28);Z(21,26);Z(22,25);Z(23,30);Z(27,29);
+		Z(1,2);Z(3,5);Z(4,8);Z(6,22);Z(7,11);Z(9,25);Z(10,12);Z(13,14);Z(17,18);Z(19,21);Z(20,24);Z(23,27);Z(26,28);Z(29,30);
+		Z(1,17);Z(2,18);Z(3,19);Z(4,20);Z(5,10);Z(7,23);Z(8,24);Z(11,27);Z(12,28);Z(13,29);Z(14,30);Z(21,26);
+		Z(3,17);Z(4,16);Z(5,21);Z(6,18);Z(7,9);Z(8,20);Z(10,26);Z(11,23);Z(13,25);Z(14,28);Z(15,27);Z(22,24);
+		Z(1,4);Z(3,8);Z(5,16);Z(7,17);Z(9,21);Z(10,22);Z(11,19);Z(12,20);Z(14,24);Z(15,26);Z(23,28);Z(27,30);
+		Z(2,5);Z(7,8);Z(9,18);Z(11,17);Z(12,16);Z(13,22);Z(14,20);Z(15,19);Z(23,24);Z(26,29);
+		Z(2,4);Z(6,12);Z(9,16);Z(10,11);Z(13,17);Z(14,18);Z(15,22);Z(19,25);Z(20,21);Z(27,29);
+		Z(5,6);Z(8,12);Z(9,10);Z(11,13);Z(14,16);Z(15,17);Z(18,20);Z(19,23);Z(21,22);Z(25,26);
+		Z(3,5);Z(6,7);Z(8,9);Z(10,12);Z(11,14);Z(13,16);Z(15,18);Z(17,20);Z(19,21);Z(22,23);Z(24,25);Z(26,28);
+		Z(3,4);Z(5,6);Z(7,8);Z(9,10);Z(11,12);Z(13,14);Z(15,16);Z(17,18);Z(19,20);Z(21,22);Z(23,24);Z(25,26);Z(27,28);
+	  return;
+	}
+	case 32: // sort32() from http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
+	{   Z(0,1);Z(2,3);Z(4,5);Z(6,7);Z(8,9);Z(10,11);Z(12,13);Z(14,15);Z(16,17);Z(18,19);Z(20,21);Z(22,23);Z(24,25);Z(26,27);Z(28,29);Z(30,31);
+		Z(0,2);Z(1,3);Z(4,6);Z(5,7);Z(8,10);Z(9,11);Z(12,14);Z(13,15);Z(16,18);Z(17,19);Z(20,22);Z(21,23);Z(24,26);Z(25,27);Z(28,30);Z(29,31);
+		Z(0,4);Z(1,5);Z(2,6);Z(3,7);Z(8,12);Z(9,13);Z(10,14);Z(11,15);Z(16,20);Z(17,21);Z(18,22);Z(19,23);Z(24,28);Z(25,29);Z(26,30);Z(27,31);
+		Z(0,8);Z(1,9);Z(2,10);Z(3,11);Z(4,12);Z(5,13);Z(6,14);Z(7,15);Z(16,24);Z(17,25);Z(18,26);Z(19,27);Z(20,28);Z(21,29);Z(22,30);Z(23,31);
+		Z(0,16);Z(1,8);Z(2,4);Z(3,12);Z(5,10);Z(6,9);Z(7,14);Z(11,13);Z(15,31);Z(17,24);Z(18,20);Z(19,28);Z(21,26);Z(22,25);Z(23,30);Z(27,29);
+		Z(1,2);Z(3,5);Z(4,8);Z(6,22);Z(7,11);Z(9,25);Z(10,12);Z(13,14);Z(17,18);Z(19,21);Z(20,24);Z(23,27);Z(26,28);Z(29,30);
+		Z(1,17);Z(2,18);Z(3,19);Z(4,20);Z(5,10);Z(7,23);Z(8,24);Z(11,27);Z(12,28);Z(13,29);Z(14,30);Z(21,26);
+		Z(3,17);Z(4,16);Z(5,21);Z(6,18);Z(7,9);Z(8,20);Z(10,26);Z(11,23);Z(13,25);Z(14,28);Z(15,27);Z(22,24);
+		Z(1,4);Z(3,8);Z(5,16);Z(7,17);Z(9,21);Z(10,22);Z(11,19);Z(12,20);Z(14,24);Z(15,26);Z(23,28);Z(27,30);
+		Z(2,5);Z(7,8);Z(9,18);Z(11,17);Z(12,16);Z(13,22);Z(14,20);Z(15,19);Z(23,24);Z(26,29);
+		Z(2,4);Z(6,12);Z(9,16);Z(10,11);Z(13,17);Z(14,18);Z(15,22);Z(19,25);Z(20,21);Z(27,29);
+		Z(5,6);Z(8,12);Z(9,10);Z(11,13);Z(14,16);Z(15,17);Z(18,20);Z(19,23);Z(21,22);Z(25,26);
+		Z(3,5);Z(6,7);Z(8,9);Z(10,12);Z(11,14);Z(13,16);Z(15,18);Z(17,20);Z(19,21);Z(22,23);Z(24,25);Z(26,28);
+		Z(3,4);Z(5,6);Z(7,8);Z(9,10);Z(11,12);Z(13,14);Z(15,16);Z(17,18);Z(19,20);Z(21,22);Z(23,24);Z(25,26);Z(27,28);
+	  return;
+	}
+   } // end of switch for optimal sort
+ }
+ #undef Z
+
+#endif
diff --git a/ya-sort2.c b/ya-sort2.c
new file mode 100644
index 0000000..5d8c8a5
--- /dev/null
+++ b/ya-sort2.c
@@ -0,0 +1,1182 @@
+/* ya-sort2.c
+   =========
+  yasort2() sorts an array of numbers and moves the contents of another array of the same size around so the "pairs" are kept together.
+  Eg if one array has x co-ords and the 2nd has y co-ords you can sort the x co-ords while keeping the x-y pairs together at the same array index.
+   
+  It uses a quicksort with the addition of introsort functionality to give both a fast average execution time and o(n*log(n)) worse case execution time,
+  see "Introspective sorting and selection algorithms" by D.R.Musser,Software practice and experience, 8:983-993, 1997. 
+  
+  ya2sort can only sort numbers.
+
+  
+  yasort2() takes ~ 71% longer than yasort() which is not surprising as it has to move twice the amount of data around.
+  
+ It uses yasort2.h to define the type of data that will be sorted ( elem_type_sort2 ).
+
+ If PAR_SORT is defined (see below) then this code uses tasks to use all available processors to speed up sorting
+ On a simple test enabling this with a 2 processor system sped up sorting by 1.5*
+*/ 
+/* 
+Copyright (c) 2021,2022 Peter Miller
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+#ifndef __BORLANDC__
+ #define NDEBUG /* if defined then asserts become "nothing" */
+#endif
+// #define YA2SORT_TEST_PROGRAM /* if defined compile a test program for this code - make sure that YASORT_MEDIAN_TEST_PROGRAM is NOT defined in ya-sort.c ! */
+							 /* For the test program, you also need to ensure the definitions of elem_type_median (in yamedian.h) and elem_type_sort2 (in yasort2.h) are the same (normally double is used for testing) */
+#define PAR_SORT /* if defined create a parallel sort using threads - this uses windows threads - see https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/beginthread-beginthreadex?view=msvc-170 */
+// #define DEBUG /* if defined then add a few printf's so you can see whats happening, can be useful if INTROSORT_MULT needs to be tuned, but oherwise not needed */
+// you should not need to edit anything below here for normal use of this software (you may need to edit yasort2.h)
+
+
+#include <time.h>
+#include <stdint.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <string.h>
+#if defined(DEBUG) || defined (YA2SORT_TEST_PROGRAM )
+ #include <stdio.h>
+ #include <time.h>
+ #include "hr_timer.h"
+#endif
+#include <assert.h>
+#ifdef PAR_SORT
+#include <process.h> /* for _beginthreadex */
+#include <windows.h> /* for number of processors */
+#endif
+
+#include "yasort2.h"  /* defines elem_type_sort2 etc */
+
+#define INTROSORT_MULT 2 /* defines when we swap to heapsort 3 means only do so very rarely, 0 means "always" use heapsort. All positive integer values (including 0) will give o(n*log(n)) worse case execution time
+							on the 1st step of the test program yasort2(100000001) sorting doubles the following execution speeds were obtained
+							with DUAL_PARTITION on yasort2 we get
+							INTROSORT_MULT  Time(secs)
+							0                       - heapsort already used (slow).
+							1				112.687 - heapsort never used, mid-range used more than at 2
+							2				112.907 - heapsort never used, mid-range used for some examples
+							3				112.925 - heapsort never used, mid range used fo 1 example
+						 */
+
+
+#ifdef DEBUG
+#define dprintf(...) printf(__VA_ARGS__) /* convert to printf when DEBUG defined */ 
+#else
+#define dprintf(...) /* nothing - only prints when DEBUG defined */
+#endif
+
+#define P_UNUSED(x) (void)x /* a way to avoid warning unused parameter messages from the compiler */
+#if defined(YA2SORT_TEST_PROGRAM) || !defined(NDEBUG)
+ static bool check_sort( elem_type_sort2 *a, size_t n); // check result of sort is ordered correctly
+#endif
+static void heapsort2(elem_type_sort2 *a,elem_type_sort2 *b,size_t n); // backup sort automatically used when required.
+
+#if defined __GNUC__
+
+static inline int ilog2(size_t x) { return 63 - __builtin_clzll(x); }
+
+#elif defined _WIN32  && !defined  __BORLANDC__
+#include <intrin.h>
+static inline int ilog2(size_t x)
+{
+    unsigned long i = 0;
+     _BitScanReverse(&i, x);
+    return i;
+}
+
+#elif defined _WIN64  && !defined  __BORLANDC__
+#include <intrin.h>
+static inline int ilog2(size_t x)
+{
+    unsigned long i = 0;
+    _BitScanReverse64(&i, x);
+    return i;
+}
+#else // version in standard C , this is slower than above optimised versions but portable
+/* algorithm from https://stackoverflow.com/questions/11376288/fast-computing-of-log2-for-64-bit-integers */
+#if (defined __BORLANDC__ && defined _WIN64) || ( defined __SIZEOF_POINTER__ &&  __SIZEOF_POINTER__ == 8)
+static inline int  ilog2(size_t x) // unsigned 64 bit version 
+{
+  #define S(k) if (x >= (UINT64_C(1) << k)) { i += k; x >>= k; }
+  int i = 0;
+  S(32);
+  S(16);
+  S(8);
+  S(4);
+  S(2);
+  S(1);
+  return i;
+  #undef S
+}
+#elif (defined __BORLANDC__ && defined __WIN32__) || ( defined __SIZEOF_POINTER__ && __SIZEOF_POINTER__ == 4 )
+static inline int  ilog2(size_t x) // unsigned 32 bit version 
+{
+  #define S(k) if (x >= (UINT32_C(1) << k)) { i += k; x >>= k; }
+  int i = 0;
+  S(16);
+  S(8);
+  S(4);
+  S(2);
+  S(1);
+  return i;
+  #undef S
+}
+#else
+#error "unknown pointer size - expected 4 (32 bit) or 8 (64 bit) "
+#endif
+
+#endif
+
+
+#define cswap(i,j) {elem_type_sort2 _t;_t=(i);i=(j);j=_t;} // example call cswap(r[i], r[minIndex]); WARNING - side effects could be an issue here ! eg cswap(r[i++],r[b])
+/* note that an intelligent swap (that only writes back if the value is different) is slightly slower in the test program */
+inline static void eswap1(size_t i,size_t j,elem_type_sort2 *a) /* swap a[i]  and a[j]  */
+	{  
+	 cswap(a[i],a[j]); // use swap macro
+	} 
+inline static void eswap2(size_t i,size_t j,elem_type_sort2 *a,elem_type_sort2 *b) /* swap a[i]  and a[j] also swap b[i] and b[j] */
+	{  
+	 cswap(a[i],a[j]); // use swap macro on array a
+	 cswap(b[i],b[j]); // use swap macro on array b
+	} 
+
+inline static void eswap2p(elem_type_sort2 *pi,elem_type_sort2 *pj,elem_type_sort2 *a,elem_type_sort2 *b) /* swap pi  and pj also swap b[i] and b[j] */
+	{  
+	 cswap(*pi,*pj); // use swap macro on array a
+	 cswap(*(pi-a+b),*(pj-a+b)); // use swap macro on array b
+	} 
+
+#define elem_type_ss elem_type_sort2 /* set type for smallsort correctly */	
+#define _SORT2 /* tell smallsort we have 2 arrays */
+#include "ya-smallsort.h" // contains small_sort() - this needs to be included after cswap is defined
+	
+#ifdef YA2SORT_TEST_PROGRAM
+#ifdef RAND_MAX
+ #undef RAND_MAX
+#endif
+#define RAND_MAX INT_MAX /* in case its used */
+
+static int bi_modal_rand(void) /* returns a random number with a "hole" in the distribution so the mean & (max-min)/2 are significantly different to the median  */
+{int r=rand();
+ while(r>RAND_MAX/4 && r<RAND_MAX/2) r=rand(); // skip values that fall in the "hole" from RAND_MAX/4 to RAND_MAX/2. "hole" is off centre to make distribution non-symetrical. 
+ return r;
+}
+#endif
+
+#ifndef min /* with gcc this is already defined */
+ #define min(a, b)	(a) < (b) ? (a) : (b) /* warning a,b evaluated more than once ! */
+#endif
+ 
+#if defined(YA2SORT_TEST_PROGRAM) || !defined(NDEBUG)
+static  bool check_sort( elem_type_sort2 *a, size_t n) // check result of sort is ordered correctly
+{// returns true if sort is ok
+ /* check array actually is sorted correctly in increasing order */
+ size_t errs=0; 
+ if(n<2) return true;
+ for(size_t i=0;i<n-1;++i)
+	 if(a[i+1]<a[i]) 
+		errs++;		
+  return (errs==0);
+}
+#endif
+
+/*
+* From:
+* Fast median search: an ANSI C implementation
+* Nicolas Devillard - ndevilla AT free DOT fr
+* July 1998
+*
+* The following routines have been built from knowledge gathered
+* around the Web. I am not aware of any copyright problem with
+* them, so use it as you want.
+* N. Devillard - 1998
+* The functions below have been edited by Peter Miller 9/2021 to fit in with the rest of the code here.
+*/
+#define Zv(a,b) { if ((a)>(b)) cswap((a),(b)); }
+ 
+
+/*----------------------------------------------------------------------------
+Function : opt_med9()
+In : pointer to an array of 9 elem_type_sort2 values
+Out : an elem_type_sort2
+Job : optimized search of the median of 9 elem_type_sort2 values
+Notice : in theory, cannot go faster without assumptions on the signal.
+Formula from:
+XILINX XCELL magazine, vol. 23 by John L. Smith
+The input array is modified in the process
+The result array is guaranteed to contain the median
+value in middle position, but other elements are NOT fully sorted sorted.
+---------------------------------------------------------------------------*/
+static elem_type_sort2 opt_med9(elem_type_sort2 * p)
+{
+Zv(p[1], p[2]) ; Zv(p[4], p[5]) ; Zv(p[7], p[8]) ;
+Zv(p[0], p[1]) ; Zv(p[3], p[4]) ; Zv(p[6], p[7]) ;
+Zv(p[1], p[2]) ; Zv(p[4], p[5]) ; Zv(p[7], p[8]) ;
+Zv(p[0], p[3]) ; Zv(p[5], p[8]) ; Zv(p[4], p[7]) ;
+Zv(p[3], p[6]) ; Zv(p[1], p[4]) ; Zv(p[2], p[5]) ;
+Zv(p[4], p[7]) ; Zv(p[4], p[2]) ; Zv(p[6], p[4]) ;
+Zv(p[4], p[2]) ; return(p[4]) ;
+}
+
+/*----------------------------------------------------------------------------
+Function : opt_med25()
+In : pointer to an array of 25 elem_type_sort2 values
+Out : an elem_type_sort2
+Job : optimized search of the median of 25 elem_type_sort2 values
+The input array is modified in the process
+The result array is guaranteed to contain the median
+value in middle position, but other elements are NOT fully sorted sorted.
+Notice : in theory, cannot go faster without assumptions on the signal.
+Code taken from Graphic Gems.
+---------------------------------------------------------------------------*/
+static elem_type_sort2 opt_med25(elem_type_sort2 * p)
+{
+Zv(p[0], p[1]) ; Zv(p[3], p[4]) ; Zv(p[2], p[4]) ;
+Zv(p[2], p[3]) ; Zv(p[6], p[7]) ; Zv(p[5], p[7]) ;
+Zv(p[5], p[6]) ; Zv(p[9], p[10]) ; Zv(p[8], p[10]) ;
+Zv(p[8], p[9]) ; Zv(p[12], p[13]) ; Zv(p[11], p[13]) ;
+Zv(p[11], p[12]) ; Zv(p[15], p[16]) ; Zv(p[14], p[16]) ;
+Zv(p[14], p[15]) ; Zv(p[18], p[19]) ; Zv(p[17], p[19]) ;
+Zv(p[17], p[18]) ; Zv(p[21], p[22]) ; Zv(p[20], p[22]) ;
+Zv(p[20], p[21]) ; Zv(p[23], p[24]) ; Zv(p[2], p[5]) ;
+Zv(p[3], p[6]) ; Zv(p[0], p[6]) ; Zv(p[0], p[3]) ;
+Zv(p[4], p[7]) ; Zv(p[1], p[7]) ; Zv(p[1], p[4]) ;
+Zv(p[11], p[14]) ; Zv(p[8], p[14]) ; Zv(p[8], p[11]) ;
+Zv(p[12], p[15]) ; Zv(p[9], p[15]) ; Zv(p[9], p[12]) ;
+Zv(p[13], p[16]) ; Zv(p[10], p[16]) ; Zv(p[10], p[13]) ;
+Zv(p[20], p[23]) ; Zv(p[17], p[23]) ; Zv(p[17], p[20]) ;
+Zv(p[21], p[24]) ; Zv(p[18], p[24]) ; Zv(p[18], p[21]) ;
+Zv(p[19], p[22]) ; Zv(p[8], p[17]) ; Zv(p[9], p[18]) ;
+Zv(p[0], p[18]) ; Zv(p[0], p[9]) ; Zv(p[10], p[19]) ;
+Zv(p[1], p[19]) ; Zv(p[1], p[10]) ; Zv(p[11], p[20]) ;
+Zv(p[2], p[20]) ; Zv(p[2], p[11]) ; Zv(p[12], p[21]) ;
+Zv(p[3], p[21]) ; Zv(p[3], p[12]) ; Zv(p[13], p[22]) ;
+Zv(p[4], p[22]) ; Zv(p[4], p[13]) ; Zv(p[14], p[23]) ;
+Zv(p[5], p[23]) ; Zv(p[5], p[14]) ; Zv(p[15], p[24]) ;
+Zv(p[6], p[24]) ; Zv(p[6], p[15]) ; Zv(p[7], p[16]) ;
+Zv(p[7], p[19]) ; Zv(p[13], p[21]) ; Zv(p[15], p[23]) ;
+Zv(p[7], p[13]) ; Zv(p[7], p[15]) ; Zv(p[1], p[9]) ;
+Zv(p[3], p[11]) ; Zv(p[5], p[17]) ; Zv(p[11], p[17]) ;
+Zv(p[9], p[17]) ; Zv(p[4], p[10]) ; Zv(p[6], p[12]) ;
+Zv(p[7], p[14]) ; Zv(p[4], p[6]) ; Zv(p[4], p[7]) ;
+Zv(p[12], p[14]) ; Zv(p[10], p[14]) ; Zv(p[6], p[7]) ;
+Zv(p[10], p[12]) ; Zv(p[6], p[10]) ; Zv(p[6], p[17]) ;
+Zv(p[12], p[17]) ; Zv(p[7], p[17]) ; Zv(p[7], p[10]) ;
+Zv(p[12], p[18]) ; Zv(p[7], p[12]) ; Zv(p[10], p[18]) ;
+Zv(p[12], p[20]) ; Zv(p[10], p[20]) ; Zv(p[10], p[12]) ;
+return (p[12]);
+}
+/* end of N. Devillard functions */
+
+/* 
+   heapsort() is used within yasort2 as part of its introsort functionality (ie when it struggles to obtain O(n*log(n)) execution time).
+   Heapsort has a guaranteed O(n*log(n)) execution time. 
+   It is however a LOT slower than yasort2 on the test suite (~ 6* based on the total test runtime)
+   Note that in practice heapsort is normally used very little so its slower speed is not an issue.  
+   The advantage of using a heapsort here is that it makes the sorting code completely independent of the median code,
+   ya_msort (in ya-select.c) could be used in its place and it a lot faster - but this makes no difference to the overall execution time of yasort2().    
+*/
+
+// heapsort based on code at http://www.codecodex.com/wiki/Heapsort#C.2FC.2B.2B - which itself is based on the numerical recipees algorithm 
+// This version supports elem_type_sort2 (ie will sort any type that can be compared ).
+// This sorts array a[] keeping values in b[] in the same relative place
+
+static void heapsort2 (elem_type_sort2 *a,elem_type_sort2 *b, size_t n) 
+  	{
+     size_t  i = n/2, parent, child;
+     elem_type_sort2 t,t2;
+     for (;;) 
+	   	{ /* Loops until a[] is sorted */
+         if (i > 0) 
+		  	 { /* First stage - creating the heap , highest value will end up in a[0] */
+              i--;          
+              t = a[i];    /* Save parent value to t */
+              t2= b[i];
+          	 } 
+		  else 
+		   	 {     /* Second stage (when i==0) - Extracting elements in-place */
+              n--;           /* Make the new heap smaller */
+              if (n == 0) return; /* When the heap is empty, we are done */
+              t = a[n];    /* Save last value (it will be overwritten on the line below) */
+              a[n] = a[0]; /* Save largest value at the end of arr */
+              t2= b[n];    
+              b[n] = b[0];              
+          	 }
+  
+         parent = i; /* We will start pushing down t from parent */
+         child = parent*2 + 1; /* parent's left child */
+  
+          /* Sift operation - pushing the value of t down the heap */
+         while (child < n) 
+		  	 { // Choose the largest child - assume left child and adjust if its the right one
+              if (child + 1 < n  &&  a[child + 1] > a[child]) 
+			  	 {
+                  child++; /*  right child is the largest */
+              	 }
+              if (a[child] > t) 
+			  	 { /* If any child is bigger than the parent */
+                  a[parent] = a[child]; /* Move the largest child up */
+                  b[parent] = b[child];
+                  parent = child; /* Move parent pointer to this child */
+                  child = parent*2+1; 
+              	 } 
+				else 
+				 {
+                  break; /* t's place is found */
+              	 }
+          	 }
+         a[parent] = t; /* We save t in the heap */
+         b[parent] = t2; 
+     	}
+  	}
+
+
+
+// This is a version of quicksort that only partitions into 2 partitions (<= pivot, >= pivot) so inner loop has a simpler structure than one that splits into 3 partitions
+// The inner quicksort loop does not degrade into a O(n^2) when there are duplicate values in the array to be sorted (many partitioning schemes do).
+// It uses introsort techniques to achieve O(n*log2(n)) worse case runtime.
+// This version was created by Peter Miller 9/1/2022
+static void _yasort2(elem_type_sort2 *x,elem_type_sort2 *y, size_t n,int nos_p); /* main worker function */
+
+#ifdef PAR_SORT /* helper code for Parallel version */
+
+#define PAR_DIV_N 16 /* divisor on n (current partition size) to check size of partition about to be spawned as a new task is big enough to justify the work of creating a new task */
+#define PAR_MIN_N 10000 /* min size of a partition to be spawned as a new task */
+
+struct _params 
+	{elem_type_sort2 *xp;
+	 elem_type_sort2 *yp;
+	 size_t np;
+	 int nos_p_p;
+	};
+	
+unsigned __stdcall yasortThreadFunc( void * _Arg ) // parallel thread that can sort a partition
+{struct _params* Arg=_Arg;
+ _yasort2(Arg->xp,Arg->yp,Arg->np,Arg->nos_p_p); // sort required section 
+ _endthreadex( 0 );
+ return 0;
+}
+
+/* we ideally need to know the number of processors - the code below gets this for windows */
+/* this is from https://docs.microsoft.com/de-de/windows/win32/api/sysinfoapi/nf-sysinfoapi-getlogicalprocessorinformation with minor changes by Peter Miller */
+// see https://programmerall.com/article/52652219244/ for Linux versions
+
+typedef BOOL (WINAPI *LPFN_GLPI)(
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, 
+    PDWORD);
+
+
+// Helper function to count set bits in the processor mask.
+static DWORD CountSetBits(ULONG_PTR bitMask)
+{
+    DWORD LSHIFT = sizeof(ULONG_PTR)*8 - 1;
+    DWORD bitSetCount = 0;
+    ULONG_PTR bitTest = (ULONG_PTR)1 << LSHIFT;    
+    DWORD i;
+    
+    for (i = 0; i <= LSHIFT; ++i)
+    {
+        bitSetCount += ((bitMask & bitTest)?1:0);
+        bitTest/=2;
+    }
+
+    return bitSetCount;
+}
+
+#ifdef YA2SORT_TEST_PROGRAM  /* only needed for the test program */
+static void proc_info(void)
+{
+    BOOL done = FALSE;
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL;
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = NULL;
+    DWORD returnLength = 0;
+    DWORD logicalProcessorCount = 0;
+    DWORD numaNodeCount = 0;
+    DWORD processorCoreCount = 0;
+    DWORD processorL1CacheCount = 0;
+    DWORD processorL2CacheCount = 0;
+    DWORD processorL3CacheCount = 0;
+    DWORD processorPackageCount = 0;
+    DWORD byteOffset = 0;
+    PCACHE_DESCRIPTOR Cache;
+    while (!done)  /* we need to call GetLogicalProcessorInformation() twice, the 1st time it tells us how big a buffer we need to supply */
+    {
+        DWORD rc = GetLogicalProcessorInformation(buffer, &returnLength);
+
+        if (FALSE == rc) 
+        {
+            if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) 
+            {
+                if (buffer) 
+                    free(buffer);
+
+                buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(
+                        returnLength);
+
+                if (NULL == buffer) 
+                {
+                    printf("\nError: Allocation failure\n");
+                    return ;
+                }
+            } 
+            else 
+            {
+                printf("\nError %d\n", (int)GetLastError());
+                return ;
+            }
+        } 
+        else
+        {
+            done = TRUE;
+        }
+    }
+
+    ptr = buffer;
+
+    while (byteOffset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= returnLength) 
+    {
+        switch (ptr->Relationship) 
+        {
+        case RelationNumaNode:
+            // Non-NUMA systems report a single record of this type.
+            numaNodeCount++;
+            break;
+
+        case RelationProcessorCore:
+            processorCoreCount++;
+
+            // A hyperthreaded core supplies more than one logical processor.
+            logicalProcessorCount += CountSetBits(ptr->ProcessorMask);
+            break;
+
+        case RelationCache:
+            // Cache data is in ptr->Cache, one CACHE_DESCRIPTOR structure for each cache. 
+            Cache = &ptr->Cache;
+            if (Cache->Level == 1)
+            {
+                processorL1CacheCount++;
+            }
+            else if (Cache->Level == 2)
+            {
+                processorL2CacheCount++;
+            }
+            else if (Cache->Level == 3)
+            {
+                processorL3CacheCount++;
+            }
+            break;
+
+        case RelationProcessorPackage:
+            // Logical processors share a physical package.
+            processorPackageCount++;
+            break;
+
+        default:
+            printf("\nError: Unsupported LOGICAL_PROCESSOR_RELATIONSHIP value.\n");
+            break;
+        }
+        byteOffset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+        ptr++;
+    }
+
+    printf("\n GetLogicalProcessorInformation results:\n");
+    printf("  Number of NUMA nodes: %d\n", 
+             (int)numaNodeCount);
+    printf("  Number of physical processor packages: %d\n", 
+             (int)processorPackageCount);
+    printf("  Number of processor cores: %d\n", 
+             (int)processorCoreCount);
+    printf("  Number of logical processors: %d\n", 
+             (int)logicalProcessorCount);
+    printf("  Number of processor L1/L2/L3 caches: %d/%d/%d\n", 
+             (int)processorL1CacheCount,
+             (int)processorL2CacheCount,
+             (int)processorL3CacheCount);  
+    free(buffer);
+
+    return;
+}
+#endif
+
+static int nos_procs(void) /* return the number of logical processors present. 
+					   This was created by Peter Miller 15/1/2022 based on above code  */
+{
+    BOOL done = FALSE;
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL;
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = NULL;
+    DWORD returnLength = 0;
+    DWORD logicalProcessorCount = 0;
+    DWORD processorCoreCount = 0;
+    DWORD byteOffset = 0; 
+
+    while (!done) /* we need to call GetLogicalProcessorInformation() twice, the 1st time it tells us how big a buffer we need to supply */
+    {
+        DWORD rc = GetLogicalProcessorInformation(buffer, &returnLength);
+
+        if (FALSE == rc) 
+        {
+            if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) 
+            {
+                if (buffer) 
+                    free(buffer);
+
+                buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(
+                        returnLength);
+
+                if (NULL == buffer) 
+                {
+                    // Error: memoery Allocation failure
+                    return 0;
+                }
+            } 
+            else 
+            {  // other (unexpected) error
+                return 0;
+            }
+        } 
+        else
+        {
+            done = TRUE;
+        }
+    }
+
+    ptr = buffer;
+
+    while (byteOffset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= returnLength) 
+    {
+        if (ptr->Relationship == RelationProcessorCore) 
+        {
+            processorCoreCount++;
+            // A hyperthreaded core supplies more than one logical processor.
+            logicalProcessorCount += CountSetBits(ptr->ProcessorMask);
+        }
+        byteOffset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+        ptr++;
+    }
+
+    free(buffer);
+
+    return (int)logicalProcessorCount; // actual number of processor cores is processorCoreCount
+}
+
+#endif
+	
+static void _yasort2(elem_type_sort2 *x,elem_type_sort2 *y, size_t n,int nos_p ) /* main worker function, nos_p is nos processors available */
+{
+ bool need_max=true,need_min=true;
+ elem_type_sort2  min, max;  
+ if(n<=1) return; // avoid trying to take log2 of 0 - anyway an array of length 1 is already sorted  
+ int itn=0;
+ const int max_itn=INTROSORT_MULT*ilog2(n); // max_itn defines point we swap to mid-range pivot, then at 2*max_int we swap to heapsort. if INTROSORT_MULT=0 then "always" use heapsort, 3 means "almost never" use heapsort
+#ifdef PAR_SORT
+ struct _params params;
+ HANDLE th=NULL; // handle for worker thread
+#else
+ P_UNUSED(nos_p); // this param is not used unless PAR_SORT is defined
+#endif	 
+ while(1) // replace tail recursion with a loop
+ 	{
+	 if (n <= 1) goto sortend; // need a common end point as may be using threads in which case we need to wait for them to complete
+	 /* Use a simple (fast) sort for small n [ this is normally an optimal sort (so use n<=32) ] Using small_sort2() is ~ 6% faster than using the insertion sort below for n<=32 */
+	 if(n<=32)
+	   		{   			
+		 	 small_sort2( x,y, n);	
+		 	 assert(check_sort( x, n) ); // check result of sort is ordered correctly
+		 	 goto sortend; // need a common end point as may be using threads in which case we need to wait for them to complete
+	    	}    	
+	 // next try an insertion sort, abort this if its taking too much effort
+	 // this efficiently sorts arrays that are almost perfectly sorted already.
+	 // using this gives an average 33% speedup using the test program when MAX_INS_MOVES=2
+	 // There is also  a potentially useful side effect of this, if we have to move to using quicksort then 2 values will have been moved which could help break up bad patterns of data.		 
+	 #define MAX_INS_MOVES 2 /* max allowed number of moves - for the test program 2 is the optimum value */
+	 size_t nos_ins_moves=0;	 
+	 for (elem_type_sort2 *p=x,*p2=y; p<x+n-1; ++p,++p2) 
+			{
+			 elem_type_sort2 t = p[1],t2=p2[1];
+			 elem_type_sort2  *j = p,*j2=p2;
+			 if(*j>t)
+			 	{// out of order 
+			 	 if(++nos_ins_moves>MAX_INS_MOVES) goto do_qsort; // too many moves  - swap to qsort	
+			 	 do
+			 		{j--;j2--;
+			 		} while(j>=x && *j>t);
+			 	 memmove(j+2,j+1,(p-j)*sizeof(elem_type_sort2));// move a portion of array x right by 1 to make space for t 		 
+			 	 j[1]=t;
+			 	 memmove(j2+2,j2+1,(p2-j2)*sizeof(elem_type_sort2));// move a portion of array y right by 1 to make space for t2 		 
+			 	 j2[1]=t2;			  		
+			 	}
+	        }			
+	 assert(check_sort( x, n) ); // check result of sort is ordered correctly
+	 goto sortend; // need a common end point as may be using threads in which case we need to wait for them to complete
+	  
+	do_qsort:  ; // ; as label can only be attached to a statement and the line below is a declaration ... 
+
+   // if we have made too many iterations of this while loop then we need to swap to heapsort 
+   if(++itn>2*max_itn)
+  		{	 
+#ifdef DEBUG 	  
+	  	 printf("yasort2: using heapsort(%.0f)\n",(double)n);
+#endif	  
+	  	 heapsort2(x,y,n);
+	  	 assert(check_sort( x, n) ); // check result of sort is ordered correctly
+  	  	 goto sortend; // need a common end point as may be using threads in which case we need to wait for them to complete
+  		}
+	/* start of quicksort */
+	/* The partitioning algorithm used here partitions into 2 groups, <= pivot and >= pivot 
+	   The code can also create a partition = pivot in some cases but this partition does not include all values = pivot
+	*/
+	/* select pivot into v - if a large number of elements use median of 25 elements otherwise use median of 9 elements.
+	   if we have made > max_itn loops then we use the mid-range for the pivot.
+	   If we get here we know array has >32 elements in it 
+	*/
+	elem_type_sort2 v; // pivot value
+	if(itn>max_itn)
+		 { /* using medians to select the pivot has failed, swap to using the mid-range ( (max-min)/2 ) as the pivot value.
+		 	  This is the pivot value used by the Torben algorithm - see ya-median.c for more details.
+ 			  Because this pivot selection method looks at all the values in x[] it always makes some progress and is not sensitive to the "pattern" of the data in x[]
+ 			  You could also consider this as a radix exchange sort (especially if sorting integers) - see "The art of concurrency, A thread monkey's guide to writing parallel applications", Clay Breshears, 2009, page 182-3 
+			  A radix exchange sort has a linear time [ O(n) ] asymptotic complexity. 			  
+		   */
+#ifdef DEBUG 	  
+	  	   printf("yasort2: using mid-range as pivot (%.0f)\n",(double)n);
+#endif	 		   
+		   if(need_max && need_min)
+		    	{min = max = x[0] ; 
+			     for (elem_type_sort2 *p=x+1; p<x+n; p++) 
+					{// find min & max
+			    	 elem_type_sort2 t=*p;
+			         if (t<min) min=t;
+			         if (t>max) max=t; 
+			    	}
+			    }
+			else if(need_max)
+		    	{max = x[0] ; 
+			     for (elem_type_sort2 *p=x+1; p<x+n; p++) 
+					{//  find max
+			    	 elem_type_sort2 t=*p;
+			         if (t>max) max=t; 
+			    	}
+			    }	
+			else if(need_min)/* need_min */
+		    	{min =  x[0] ; 
+			     for (elem_type_sort2 *p=x+1; p<x+n; p++) 
+					{//  find min
+			    	 elem_type_sort2 t=*p;
+			         if (t<min) min=t;
+			    	}
+			    }	    
+		 	if(max==min) 
+		 	 	{
+			 	 // only 1 value - so nothing left to sort. This should never happen as the insertion sort would have trapped this and already returned...
+			 	 goto sortend; // need a common end point as may be using threads in which case we need to wait for them to complete
+			 	}    	
+		 	else
+		 		{
+				 v= min/2+max/2; // reasonable estimate of median, so use as pivot.		
+				}
+   		 }	
+	else if(n>10000 )
+		{// large number of elements then take median of 25 values
+	 	 // use 1st 5 middle 5 and last 5 and two sets of 5 inbetween to try and be "cache friendly"
+	 	 elem_type_sort2 a[25];
+	 	 const size_t b=(n-1)/2; // middle item
+	 	 size_t c=b/2;// 1/4 point
+	 	 // copy values into array a in blocks of 5
+	 	 memcpy(a,x,5*sizeof(elem_type_sort2)); // to,from,size :start: x[0],x[1],x[2],x[3],x[4]
+	 	 memcpy(a+5,x+c-2,5*sizeof(elem_type_sort2));// 1/4 : x[c-2],x[c-1],x[c],x[c+1],x[c+2]
+		 memcpy(a+10,x+b-2,5*sizeof(elem_type_sort2)); // middle: x[b-2],x[b-1],x[b],x[b+1],x[b+2] 
+		 c+=b;// 3/4 point
+		 memcpy(a+15,x+c-2,5*sizeof(elem_type_sort2)); // 3/4: x[c-2],x[c-1],x[c],x[c+1],x[c+2]
+		 memcpy(a+20,x+n-5,5*sizeof(elem_type_sort2)); // end: x[n-5],x[n-4],x[n-3],x[n-2],x[n-1]
+		 v=opt_med25(a); 	
+		}
+	 else	
+  		 /* Median of 9 items selected uniformly across dataset  */
+		{
+	 	 elem_type_sort2 a[9];
+	 	 const size_t b=(n-1)/2; // middle item
+	 	 const size_t c=(n-1)/9; 
+	 	 a[0]=x[0];
+	 	 a[1]=x[2*c];
+	 	 a[2]=x[3*c];
+	 	 a[3]=x[4*c];
+	 	 a[4]=x[b];
+	 	 a[5]=x[6*c];
+	 	 a[6]=x[7*c];
+	 	 a[7]=x[8*c];
+	 	 a[8]=x[n-1];
+	 	 v=opt_med9(a);
+	 	}
+    /* partitioning scheme with pivot as a value (v) */
+	 // v is pivot (for this implementation this does not have to be a value thats actually in the array, but it should be in the range of values in the array !
+     /* this version uses pointers rather than array indices - this might be simpler for a compiler to optimise ? and it also avoids issues with unsigned values going to "-1" */
+	 /* with TDM-gcc 10.3.0 this is about 2% faster on the test program than a version using array indices */
+	 elem_type_sort2 *pi,*pj;// 2 pointers to replace indices i,j
+	 pi=x-1;// one before start as loop below starts with pi++
+	 pj=x+n; // one after end as loop below starts pj--
+	 for (;;) 
+	 	{
+		 do pi++; while (pj >= pi && *pi < v); // was i < n
+		 do pj--; while (pj >= pi && *pj > v);
+		 if (pj < pi) break; 
+		 //eswap2(pi-x,pj-x, x, y); // still uses indices
+		 eswap2p(pi,pj, x, y); 
+		}	
+	 pj=pi;// now look for multiple values equal to v adjacent to pivot position (pi)
+	 if(*pi==v)
+	 	{
+	 	 while(pi<x+n-1 && pi[1]==v) ++pi;
+	 	 while(pj>x && pj[-1]==v) --pj;
+	 	}	
+#ifdef PAR_SORT	 
+	  /* if using parallel tasks check here to see if task spawned from this function has finished, if so we can spawn another one to keep it busy
+	     We allow spawned tasks and recursive calls to spawn more tasks if we have enough processors (thats the (nos_p)/2 passed as a paramater)
+		 This approach should scale reasonably well without needing any complex interactions betweens tasks (as they are all working on seperate portions of the arrays x & y)
+		 With 1 processor everything is in main function.
+		 With 2 processors we use both
+		 With 4 processors we use 3 [ as (4)/2=2, then 2/2=1 , so we use main processor and spawn 2 threads]
+		 With 8 (logical) processors we use 7 [ (8)/2=4, (4)/2=2, then 2/2= 1 so we use main processor + 2 threads + 4 threads = 7 in total ]
+		 In terms of run time - using a processor with 8 logical cores (4 physical cores) available:
+		 1 core (PAR_SORT not defined) - test program sorting largest size took 52.662 secs
+		 4 cores																21.767 secs = 2.4* speedup
+		 8 cores 						 										19.350 secs = 2.7* speedup (there were only 4 physical cores which may partly explain the reduced improvement)
+	  */ 	
+	  if(th!=NULL && n>2*PAR_MIN_N )
+	  	{// if thread active and partition big enough that we might be able to use a parallel task (2* as we will at least halve the size of the partition for the parallel task) 
+	  	 if( WaitForSingleObject( th, 0 )!=WAIT_TIMEOUT)
+	  		{// if thread has finished
+    		 CloseHandle( th );// Destroy the thread object.
+			 th=NULL; // set to NULL so we can reuse it
+			 // printf("Thread finished at depth %d\n",depth);
+			}
+		}
+#endif     
+	 if((size_t)(pj-x)<n-(size_t)(pi-x)) // j<n-i
+	 	{
+#ifdef PAR_SORT
+		if( th==NULL && nos_p>1 && (size_t)(pj-x) > n/PAR_DIV_N && (size_t)(pj-x) > PAR_MIN_N)
+			{// use a worker thread last 2 tests check the overhead of thread creation is worth it.
+			 params.xp=x;
+			 params.yp=y;
+			 params.np=pj-x;
+			 params.nos_p_p=(nos_p)/2; // if we still have spare processors allow more threads to be started
+			 th=(HANDLE)_beginthreadex(NULL,0,yasortThreadFunc,&params,0,NULL);
+			 if(th==NULL) _yasort2(x,y, pj-x,0); // if starting thread fails then do in this process.  nos_p=0 so don't run any tasks from here
+			}
+		else
+			{_yasort2(x,y, pj-x,(nos_p)/2); // recurse for smalest partition so stack depth is bounded at O(log2(n)). Allow more threads from subroutine if we still have some processors spare
+			 assert(check_sort(x,pj-x));// check sort worked correctly
+			}
+#else	 		
+		 _yasort2(x,y, pj-x,0); // recurse for smalest partition so stack depth is bounded at O(log2(n))
+		 assert(check_sort(x,pj-x));// check sort worked correctly
+#endif		 
+
+	 	 y+=pi-x;
+	 	 n-=pi-x;
+	 	 x=pi;
+	 	}
+	 else
+	 	{
+#ifdef PAR_SORT
+		if( th==NULL && nos_p>1 && n-(pi-x) > n/PAR_DIV_N && n-(pi-x) > PAR_MIN_N)
+			{// use a worker thread
+			 params.xp=pi;
+			 params.yp=y+(pi-x);
+			 params.np=n-(pi-x);
+			 params.nos_p_p=(nos_p)/2;			 
+			 th=(HANDLE)_beginthreadex(NULL,0,yasortThreadFunc,&params,0,NULL);
+			 if(th==NULL) _yasort2(pi,y+(pi-x), n-(pi-x),0); // if starting thread fails then do in this process.  nos_p=0 so don't run any tasks from here
+			}
+		else
+			{ _yasort2(pi,y+(pi-x), n-(pi-x),(nos_p)/2); // recurse for smalest partition so stack depth is bounded at O(log2(n))
+	 	 	 assert(check_sort(pi,n-(pi-x))); // check sort worked correctly
+			}
+#else			
+		 _yasort2(pi,y+(pi-x), n-(pi-x),0);
+	 	 assert(check_sort(pi,n-(pi-x))); // check sort worked correctly
+#endif	 	 
+	 	 n=pj-x;	
+		}		 		 
+	} // end while(1)
+ sortend: ; // need a common end point as may be using threads in which case we need to wait for them to complete	
+ #ifdef PAR_SORT
+ if(th!=NULL)
+ 	{// if a thread used need to wait for it to finish
+ 	 WaitForSingleObject( th, INFINITE );
+    // Destroy the thread object.
+     CloseHandle( th );
+	}
+ #endif
+ return; 
+}
+
+void yasort2(elem_type_sort2 *x,elem_type_sort2 *y, size_t n) /* user function  */
+{
+#ifdef PAR_SORT	
+ int nos_p=nos_procs() ;// total number of (logical) processors available
+ _yasort2(x,y, n,nos_p); /* call main worker function */
+#else
+ _yasort2(x,y, n,1); /* call main worker function , with only 1 processor (as NOS_PAR not defined ) */
+#endif
+}
+
+
+#ifdef YA2SORT_TEST_PROGRAM /* test program required, times execution - while doing a set of benchmark tests for functionality */
+
+#include "hr_timer.h"
+#include <math.h>
+#include "yamedian.h" // we also test median code, so need header for that as well
+
+static size_t dataLen=100000001; /* should normally be 100000001 (needs this for median of 3 killer pattern) and elem_type_sort2 should be double for most tests */
+#define NOS_TESTS 3 /* number of runs of median() to time [normally 3 - we use median of this many runs ]*/
+#define TIMES_IGNORE 0 /* ignore this many longest runtimes [ normally 0 - median time which rejects outliers so this has little use now ] */
+
+/* use the #if chain below to select what you want to check/benchmark. */
+
+#if 1 /* test ya2sort  */
+
+// void qsort2(elem_type_sort2 *a,elem_type_sort2 *b, int n)
+#define MEDIAN_NAME "yasort2-2 partition " // name of algorithm we are testing
+#define median(a,b,s) (yasort2(a,b,s),a[(s-1)/2])  // function call to test 
+#define IS_SORT /* method sorts data */
+
+#else /* test heapsort2 */
+#define MEDIAN_NAME "heapsort2" // name of algorithm we are testing
+#define median(a,b,s) (heapsort2(a,b,s),a[(s-1)/2])  // function call to test 
+#define IS_SORT /* method sorts data */
+
+#endif
+
+/* insertion sort for small arrays of doubles (used in test program below) */
+static inline void dbl_ins_sort( double *x, const size_t n)
+	{
+	 if(n<2) return;// if length 1 then already sorted  
+     for (double *p=x; p<x+n-1; ++p) 
+		{
+		 double t = p[1];
+		 double *j = p;
+		 if(*j>t)
+		 	{// out of order 
+		 	 do
+		 		{j--;
+		 		} while(j>=x && *j>t);
+		 	 memmove(j+2,j+1,(p-j)*sizeof(elem_type_sort2));// move a portion of array x right by 1 to make space for t 		 
+		 	 j[1]=t;		
+		 	}
+        }							
+	 return ; // all done
+	}
+
+#define STR(x)   #x
+#define XSTR(x)  STR(x) /* generate a string that the macro expanded value of x */
+
+int main(int argc, char *argv[])
+{
+ elem_type_sort2 *data,*data2; // actual array of items that needs to be sorted - space allocated by malloc below 
+ size_t i;
+ elem_type_sort2 m,m_s=0;// m is median from my function, m_s is median from sort
+ double start_t,end_t; // for hr_timer
+ double durations[NOS_TESTS];
+ double av_duration=0;
+ double total_time=0;
+ double max_time=0;
+ int nos_lengths=0;// number of different lengths of data[] we have tested 
+ int errs=0;
+ uint64_t sum_before,sum_after; // sum of all values to be sorted (before sort, after sort)
+ uint32_t xor_before,xor_after; // ditto but xor of all values
+ FILE *logout=NULL;
+ P_UNUSED(argc) ;/* a way to avoid warning unused parameter messages from the compiler */
+ P_UNUSED(argv) ;/* a way to avoid warning unused parameter messages from the compiler */
+ init_HR_Timer();
+#if __SIZEOF_POINTER__ == 8 /* 64 bit pointers */
+ printf("Compiled for 64 bit pointers : ");
+#elif __SIZEOF_POINTER__ == 4 /* 32 bit pointers */
+ printf("Compiled for 32 bit pointers : ");
+#else
+ printf("Compiled for unknown pointer size ! : "); 
+#endif 
+#ifdef PAR_SORT
+ proc_info(); // info about the processor thats running this
+ printf("%d logical processor(s) available\n",nos_procs());
+#endif 
+ logout=fopen("logout.csv","w");
+ if(logout==NULL) printf("Warning: cannot open logfile\n");
+ else fprintf(logout,"Size,average time(s),max time(s)\n");
+ printf("elem_type_sort2=%s,sizeof(size_t)=%d log2(sizeof(size_t))=%d log2(%d)=%d\n",XSTR(elem_type_sort2),(int)sizeof(size_t),ilog2(sizeof(size_t)),(int)dataLen,ilog2(dataLen)); 
+ dprintf("RAND_MAX=%u (0x%x)\n",RAND_MAX,RAND_MAX);
+ printf("Timing %s(%d)\n",MEDIAN_NAME,(int)dataLen);
+ int testtype=0;// normally 0 to do all tests
+ size_t k;
+ data=(elem_type_sort2 *)malloc(dataLen * sizeof(elem_type_sort2));/// allocate memory for data to be searched for medians
+ if(data==NULL)
+ 	{fprintf(stderr,"Error - not enough RAM to allocate data array of size %u\n",(unsigned)dataLen);
+ 	 exit(1);
+ 	}
+ data2=(elem_type_sort2 *)malloc(dataLen * sizeof(elem_type_sort2));/// allocate memory for data to be searched for medians
+ if(data==NULL)
+ 	{fprintf(stderr,"Error - not enough RAM to allocate data array2 of size %u\n",(unsigned)dataLen);
+ 	 exit(1);
+ 	} 	
+ while(1)
+  {++testtype;// 1...
+   for(int test_nos=0;test_nos<NOS_TESTS;++test_nos) 
+ 	{
+ 	 switch(testtype) // 1...
+	  {case 1:	
+ 	 		for(i=0; i<dataLen;++i) data[i]=i; // just increasing
+ 	 		if(test_nos==0) printf("Test pattern is ramp 0...%d\n",(int)dataLen-1);
+			break;		 	 
+	   case 2:
+ 	 		for(i=0; i<dataLen;++i) data[i]=i+1; // just increasing 1,2,... but zero at end
+ 	 		data[i-1]=0; // 0 at end 
+ 	 		if(test_nos==0) printf("Test pattern is ramp 1...%d,0\n",(int)dataLen-1);
+ 	 		break;
+	   case 3:
+ 	 		for(i=0; i<dataLen;++i) data[i]=i&0xff; // sawtooth	
+ 	 		if(test_nos==0) printf("Test pattern is sawtooth 0..255\n");
+ 	 		break;
+ 	   case 4:
+ 			k=dataLen/2;
+ 			for(i=0;i<k;++i)
+				{
+	 	 		 data[i]=((i)&1)==0?i+1:i+k;// k=size/2; arr=1,k+1,3,k+3,5,.... 
+				} 
+ 			for(;i<dataLen;++i)
+				{
+	 	 		 data[i]=(i+1-k)*2;// k=size/2; arr=2,4,6....2k-2,2k 
+				} 	 
+ 			if(test_nos==0) printf("Test pattern is median of 3 killer:\n"); // *** needs size 10000001			 
+ 			break;
+ 		case 5: 
+ 			for(i=0;i<dataLen;++i)
+				{
+	 			data[i]=5;
+				}
+			if(test_nos==0)  printf("Array is all 5:\n");
+			break;
+ 		case 6: 
+ 			for(i=0;i<dataLen;++i)
+				{
+	 			data[i]=5;
+				}
+			data[dataLen/5]=0;// 2 different  values spread in array
+ 			data[dataLen/20]=10; 		
+			if(test_nos==0)  printf("Array is all 5 apart from 2 elements which are 0,10:\n");
+			break;	
+ 		case 7: 
+ 			for(i=0;i<dataLen;++i)
+				{
+	 			data[i]=(i&0xff) ^ 0x5a;//to give lots repeating values in a "random" order
+				}		
+			if(test_nos==0)  printf("Array has lots of repeating values in a random order:\n");
+			break;	
+ 		case 8: 
+ 			for(i=0;i<dataLen;++i)
+				{
+	 			data[i]=(i>>6);// >>6 = /64, array sizes upto 32 are special cases so wanted to be more than that
+				}		
+			if(test_nos==0)  printf("Array is slowly increasing with lots of repeating values:\n");
+			break;
+ 		case 9: 
+ 			for(i=0;i<dataLen;++i)
+				{
+	 			data[i]=dataLen-(i>>6);// >>6 = /64, array sizes upto 32 are special cases so wanted to be more than that
+				}		
+			if(test_nos==0)  printf("Array is slowly decreasing with lots of repeating values:\n");
+			break;
+		case 10: 
+			srand(testtype); // we always want the same sequence as we only want to check the median via sort once
+ 			for(i=0;i<dataLen;++i)
+				{
+	 			data[i]=rand();
+				}		
+			if(test_nos==0)  printf("Array is full of random values:\n");
+			break;	
+		case 11: 
+			srand(testtype); // we always want the same sequence as we only want to check the median via sort once
+ 			for(i=0;i<dataLen;++i)
+				{
+	 			data[i]=rand()&1;// 0 and 1 only
+				}		
+			if(test_nos==0)  printf("Array is full of random binary (0/1) values:\n");
+			break;	
+		case 12: 
+			srand(testtype); // we always want the same sequence as we only want to check the median via sort once
+ 			for(i=0;i<dataLen;++i)
+				{
+	 			data[i]=bi_modal_rand();
+				}		
+			if(test_nos==0)  printf("Array is full of bi-modally distributed random values:\n");
+			break;				
+	    case 13:	
+ 			for(i=0;i<dataLen;++i) data[i]=dataLen-1-i; // just decreasing
+ 	 		if(test_nos==0) printf("Test pattern is ramp %d...0\n",(int)dataLen-1);
+ 	 		break;
+	    case 14:
+ 	 		for(i=0;i<dataLen;++i) data[i]=dataLen-i; // just decreasing but zero at start
+ 	 		data[0]=0; // 0 at start 
+ 	 		if(test_nos==0) printf("Test pattern is ramp 0,%d...1\n",(int)dataLen-1);
+ 	 		break;	
+		case 15: 
+			srand(testtype); // we always want the same sequence as we only want to check the median via sort once
+ 			for(i=0;i<dataLen;++i)
+				{
+	 			data[i]=rand()/4 +rand()/4+rand()/4+rand()/4;// approximate a normal distribution by adding four uniform distribution (only true if elem_type_sort2 has 32 bits or more or is floating point ) 
+				}		
+			if(test_nos==0)  printf("Array is full of normally distributed values:\n");
+			break;	
+		case 16: 
+			// exponential ramp - repeating like a sawtooth. 0,1,2,4,8,16,32,... then back to 0,1,2,...
+			data[0]=0;
+ 			for(i=1;i<dataLen;++i)
+				{
+				 if(data[i-1]==0 ) data[i]=1;
+				 else data[i]=data[i-1]*2; // *2, assume overflows to 0 if elem_type_sort2 is an "integer" type.
+				 if(!isnormal((double)data[i])) data[i]=0;// if elem_type_sort2 is float or double then trap overflow and make 0 in the same was as integers behave.
+				}		
+			if(test_nos==0)  printf("Array is exponentially ramping sawtooth:\n");
+			break;		
+		case 17: 
+			// shuffle of simple sequence 0,..n-1
+ 	 		for(i=0; i<dataLen;++i) data[i]=i; // just increasing
+ 	 		if(test_nos==0) 
+			  	{printf("Test pattern is shuffle of ramp 0...%d\n",(int)dataLen-1);			
+			  	 srand(testtype); // initialise random generator - only do this once so we get different shuffles every time, as this is a shuffle all the means should be the same
+			  	}
+			// Fisher Yates shuffle see eg Programming classics page 244 algorithm 8.2.1 Random~Shuffle
+ 			for(i=0;i<dataLen-1;++i)
+				{
+				 eswap1(i, i+rand()%(dataLen-i),data); // swap element with random later element - the use of % will not quite give a uniform distribution for large dataLen but thats not really an issue here
+				}		
+			for(i=0;i<20;++i) {dprintf(" data[%d]=%.0f",(int)i,(double)data[i]);} dprintf("\n"); // print start of shuffled sequence
+			break;										  																 					 
+ 		default:
+ 			if(logout!=NULL) fprintf(logout,"%.0f,%.3f,%.3f\n",(double)dataLen,total_time/(testtype-1),max_time); 
+ 			printf("Sum of times for all sorts is %.3f secs, average sort time was %.3f secs, max sort time was %.3f secs\n",total_time,total_time/(testtype-1),max_time); 
+ 			testtype=1;// get ready to repeat with a different length for data[]
+ 			test_nos= -1;// -1 as incremented by for loop before next iteration
+ 			if((dataLen&1)==1) --dataLen; // change odd to even
+ 			else dataLen=(dataLen>>1)|1; // divide by 2 and make even
+ 			total_time=0;
+ 			max_time=0;
+ 			nos_lengths++;
+ 			if(nos_lengths==10) 
+			 	{if(errs==0)
+			 		printf("\n  no errors found during tests\n");
+			 	 else
+				 	printf("\n  a total of %d error(s) found during tests\n",errs);
+				 if(logout!=NULL) 
+				 	{printf("Time vs size stored in logout.csv\n");	
+				 	 fclose(logout);	
+				 	}
+				 exit(errs);// all done, normal exit, 0 means good <>0 means error(s)
+				}
+ 			printf("**** Size now is %d\n",(int)dataLen);
+ 			continue;
+ 	   }
+	 if(test_nos==0)
+	 	{ // 1st time for each testtype we calculate before sum and xor, no need to repeat this as it will not change for the same testtype	   
+	 	 sum_before=sum_after=0;
+	 	 xor_before=xor_after=0;
+		 for(i=0; i<dataLen;++i)  
+	 		{sum_before+=(uint32_t)data[i];
+	 	 	 xor_before ^= (uint32_t)data[i];
+	 	 	 data2[i]=-data[i]; // nice simple pattern to check
+	 		}		
+	 	}
+	 else
+	 	{ 	   
+	 	 sum_after=0;
+	 	 xor_after=0;	
+	 	 // always need to restore data2[]
+		 for(i=0; i<dataLen;++i)  
+	 		{
+	 	 	 data2[i]=-data[i]; // nice simple pattern to check
+	 		}				  	
+	 	}	 	
+ 	 // printf("S");  
+ 	 start_t=read_HR_Timer(); // start of timed section
+ 	 m=median(data,data2,dataLen); // warning median changes order of elements of array iarr!
+ 	 end_t=read_HR_Timer(); // end of timed section
+ 	 // printf("E");
+ 	 durations[test_nos]=end_t-start_t; // duration for this test
+	 dprintf("Check-full - sort:\n"); 	 
+ 	 // check we still have the same relationship between the 2 arrays, and the sum and xor are unchanged which means there is a high probability all the data items are unchanged. 
+ 	 // xor (longitudinal parity) will bit up an odd number of bit errors in every bit position, while the sum ("checksum") will detect most other errors
+	 for(i=0; i<dataLen;++i)  
+	 	{sum_after+=(uint32_t)data[i];
+	 	 xor_after ^= (uint32_t)data[i];
+	 	 if(data2[i] != -data[i])	 
+	 		{printf("  data[%.0f]=%.0f data2[%.0f]=%.0f expected %.0f\n",(double)i,(double)data[i],(double)i,(double)data2[i],(double)(-data[i])); 		 
+			 ++errs;
+			}
+		}
+	 if(sum_after!=sum_before)
+	 	{printf(" Error: Checksum changed by sort before=%llX after=%llX\n",sum_before,sum_after);
+		 ++errs;
+		}
+	 if(xor_after!=xor_before)
+	 	{printf(" Error: Parity changed by sort before=%X after=%X\n",xor_before,xor_after);
+		 ++errs;
+		}		
+
+   	 if(!check_sort(data,dataLen)) 
+   		{printf("  data is not sorted after median calculation\n");  
+   	 	 ++errs;
+   		} 
+   	 if(testtype==1 || testtype==2 || testtype==13 || testtype==14 || testtype==17)
+   	 	{ // all of these should give a sorted result that is easy to check exactly */
+   	 	 for(i=0; i<dataLen;++i) 
+			 	if(data[i]!=i) // just increasing
+		 			{printf("  data[%.0f]=%.0f expected %.0f\n",(double)i,(double)data[i],(double)i); 		 
+		 	 		 ++errs;
+		 			}
+		}
+ 	 if(test_nos==0)
+ 	 	{m_s=ya_median(data,dataLen); // get median using a method that does not change array (and does not involve sorting).
+   	     printf ("%s(iarr,%d)=%.0f (via ya_median is %.0f)",MEDIAN_NAME,(int)dataLen,(double)m,(double)m_s);
+   		 if(m==m_s) printf(" : OK\n"); 
+ 	  	 else 
+	   		{printf(" : Median incorrect!\n");
+	     	 ++errs;
+			}					
+ 	 	}
+	 else
+	 	{// no need to repeat ya_median() as we already know correct median (m_s)
+   		 if(m!=m_s)  
+	   		{printf ("%s(iarr,%d)=%.0f (via ya_median is %.0f)",MEDIAN_NAME,(int)dataLen,(double)m,(double)m_s);
+			 printf(" test_nos=%d : Median incorrect!\n",test_nos);
+	     	 ++errs;
+			}	 		
+		}
+ 	}
+   for(i=0;i<NOS_TESTS;++i)
+   	 printf("%c%.3f ms",i==0?' ':',',1000.0*durations[i]);
+   dbl_ins_sort(durations,NOS_TESTS); // sort times (array doubles) 
+   av_duration=0;
+   for(i=0;i<NOS_TESTS-TIMES_IGNORE;++i)
+ 		av_duration+=durations[i];
+   av_duration/=(double)(NOS_TESTS-TIMES_IGNORE);	
+ 
+   printf(" Sorting via %s(%d) took %.3f msecs [range was %.3f to %.3f ms, average %.3f ms] \n",MEDIAN_NAME,(int)dataLen,1000.0*durations[(NOS_TESTS-TIMES_IGNORE-1)/2]/*median*/,1000.0*durations[0],1000.0*durations[NOS_TESTS-TIMES_IGNORE-1],1000.0*av_duration);
+   total_time+=durations[(NOS_TESTS-TIMES_IGNORE-1)/2];// used to be durations[0] which added min times, but median is more reasonable as some test cases have a single min time thats much smaller than the other times.
+   if(durations[(NOS_TESTS-TIMES_IGNORE-1)/2] > max_time) max_time=durations[(NOS_TESTS-TIMES_IGNORE-1)/2];
+  } // end while(1)
+ exit(1); // should not get here 
+}
+#endif
diff --git a/yamedian.h b/yamedian.h
new file mode 100644
index 0000000..054b961
--- /dev/null
+++ b/yamedian.h
@@ -0,0 +1,21 @@
+/* yamedian.h */
+#ifndef __YAMEDIAN_H
+ #define __YAMEDIAN_H
+ #ifdef elem_type_median 
+  #error "attempt to redefine elem_type_median"  
+ #endif
+ 
+ /* the (single) #define below is the ONLY line you should edit in this header file! */
+ #define elem_type_median float /* type of array to process - should be an standard type for which compares are defined eg int,unsigned, float, double etc */
+
+ #ifdef __cplusplus
+  extern "C" {
+ #endif  
+ void yaselect(elem_type_median* r, size_t n, size_t length); // select or nth_element . Places the n th element of a sequence in the position that it would be if the array was sorted. Changes array r. 
+ elem_type_median yaMedian(elem_type_median *a, size_t s); // return median of array a using above code. As a side effect this changes array a. This is faster than ya_median().
+ elem_type_median ya_median(elem_type_median *m, const size_t n);// calculates median without changing array m. Uses free memory (via malloc) if available to speed process up - but will work with no free memory. This is slower than yaMedian().
+ void ya_msort(elem_type_median *a,size_t n); // sort based on yaMedian. Guaranteed O(n*log(n)) execution speed but ~ 50% slower than yasort() on test program 
+ #ifdef __cplusplus
+    }
+ #endif 
+#endif
diff --git a/yasort2.h b/yasort2.h
new file mode 100644
index 0000000..b6cb66e
--- /dev/null
+++ b/yasort2.h
@@ -0,0 +1,16 @@
+/* yasort2.h */
+#ifndef __YASORT2_H
+ #define __YASORT2_H
+ #if defined(elem_type_sort2) 
+  #error "attempt to redefine elem_type_sort2"  
+ #endif
+ /* the line below is the only line in this file you should edit ! */
+ #define elem_type_sort2 float /* type of array to process - should be an standard type for which compares are defined eg int,unsigned, float, double etc */
+ #ifdef __cplusplus
+  extern "C" {
+ #endif 
+ void yasort2(elem_type_sort2 *x,elem_type_sort2 *y, size_t n); /* general purpose sort for x, with y reordered at the same time so x,y pairs stay at the same array index*/
+ #ifdef __cplusplus
+    }
+ #endif
+#endif