-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathkdata.h
365 lines (326 loc) · 20.2 KB
/
kdata.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
#include <assert.h>
#include <stdio.h>
#include <stdint.h>
#include "admissible.h"
#include "invtab.h"
#include "b32.h"
#include "mem.h"
#include "cstd.h"
/*
This module performs k-specific precomputation, including creating tables of admissible z for d mod 27k and precomputing data used when lifting
arithmetic progressions modulo divisors of 54k.
We will always try to crt/lift mod one of 9, 18, 63, 81, 126, or 162 (at most 4 of these, depending on k) without increasing the number of progresssions.
Depending on how much room there is between d and zmax we will lift modulo larger divisors of 54k in order to take advantage of cubic reciprocity.
Everything in this module is precomputed and then shared (readonly)
*/
static int inline cubefree(uint32_t k)
{
uint64_t p,q;
for ( int i = 1 ; ; i++ ) {
p = _primes_small_primes[i];
if ( (q=p*p*p) > k ) return 1;
if ( !(k%q) ) return 0;
}
}
#define MAXK 1000 // Must exceed 2^16/27
#define K27MAXN 388 // Maximum number of admissible z's mod 27*k for any d mod 27*k for k < 1000 with 3-adic valuation 1
static inline int goodk(uint32_t k) { return k && k <= MAXK && !mod3(k) && mod9(k) && cubefree(k); }
// To measure the benefit of lifting arithmetic progressions of z (for a fixed d) viewed as a subset S of Z/rZ to Z/rmZ for some m coprime to r
// we use the "benefit" function below which returns a 32-bit approximation to 2^30 * 1-log(n)/log(m), where n is the number of admissible z's mod m
// the rational number benefit(n,m)/2^30 approximates the number of "bits per bit" we gain by lifing S mod r to S mod rm
// 0 bits per bit means the projection Z/rmZ -> Z/mZ has maximal fibers above every z in S mod r (so #(S mod rm) = m * #(S mod r))
// 1 bits per bit means the projection Z/rmZ -> Z/mZ restricts to an injection S mod rm -> S mod m (so #(S mod rm) = #(S mod r))
static inline uint32_t benefit(int n, int m) { softassert(n>0 && n <= m); return floor((1<<30)*(1.0-log(n)/log(m))); }
static uint16_t K; // global shared value of k stored in 16 bits (we don't use k because this is often a local variable/counter)
static uint16_t kpcnt; // number of primes other then 3 dividing k (could be 0)
static uint16_t kp[4]; // list of primes that divide k
static uint8_t kv[4]; // kv[i] is the valuation of k at the prime kp[i]
static uint16_t kq[4]; // kp[i]^kv[i]
static int keps; // 1 for k=3 mod 9, -1 for k=6 mod 9
static inline unsigned sgnz_index (uint64_t d) { return (unsigned)(1+keps*ui64_kronecker3(d))>>1; } // sgnz index (0=-1, 1=1)
static uint16_t k27; // 27k
static uint16_t k27m; // Divisor of 27k throught which cubic reciprocity constraints factor; not currently used
static uint64_t k27minv; // floor(2^64/(27k)) // switch to floor(2^64/m)) when k27m is used
static uint16_t *k27zs; // dynamicall allocated buffer into which entries in k27tab point and k27ftab[i]->ztab point
static uint16_t *k27itab; // table of inverses of d mod 27k indexed by d mod 27k (followed by tables for k27ftab[i]->itab)
static uint16_t *k27zcnts; // table of counts of admissible z's mod 27k, indexed by d mod 27k (followed by tables for k27ftab[i]->zcnts)
static uint32_t *k27ztab; // table of offests into k27zs, indexed by d mod 27k (followed by tables for k27ftab[i]->ztab)
static uint64_t kdmax[9]; // use d <= dbmax[i] to test d*kd[i] <= dmax, zero terminated, at most 8 nonzero entries
static uint16_t kdmin; // smallest admissible d|k greater than 1 (0 if none)
static uint16_t kdcnt; // number of admissible d|k
static struct kdrec { // table with records for each admissible d|k
uint16_t d; // admissible d | k (so d is prime to 3 and v_p(d) = v_p(k) for all p|d (note k is cubefree)
uint8_t n; // number of cuberoots of k mod d (just 1 if d is squarefree, since then 0 is the only cuberoot)
uint8_t fi; // index into k27ftab of m = lcm(9*d,2) -- we will always lift at least to this m (which will have zcnts[d] == n)
uint32_t fmask; // bitmask whose ith bit is set if k27ftab[i].m is properly divisible by d and beneficial
} kdtab[8]; // for k <= 1000 at most 7 nontrivial admissible d | k, we use entry 0 for 1 (meaning d is coprime to k)
#define MAXK27FCNT 24 // maximum number of divisors m of 27k divisible by gcd(18,3k) for k < 1000
static uint16_t k27fcnt; // actual number of divisors m of 27k divisible by gcd(18,3k) for our k
static struct k27frec { // table of these divisors m
uint16_t m; // gcd(3k,18) | m | 27k
uint16_t unused;
uint32_t ztot; // total number of zs stored in k27zs for m (at most 2*m*m/3, but usually a lot less)
uint64_t minv[4]; // floor(2^64/m), floor(2^64/(2m)), floor(2^64/(7m)), floor(2^64/(14m)) precomputed for Barrett reduction
uint16_t *itab; // table of inverses of d mod m indexed by d mod m
uint16_t *zcnts; // zcnts[d] is length of ztab[d] for d mod m
uint32_t *ztab; // table of offsets into k27zs, indexed by d mod m
} k27ftab[MAXK27FCNT];
static inline uint32_t crt2 (uint64_t za, uint64_t a, uint32_t z2) { return za + a*(z2^(za&1)); }
static uint8_t _inv7[7] = {0,1,4,5,2,3,6};
static inline uint32_t inv7 (uint64_t a) { return _inv7[mod7(a)]; }
static inline uint32_t crt7 (uint64_t za, uint64_t a, uint32_t z7) { return za + a*mod7((z7+7-mod7(za))*inv7(a)); }
static inline uint32_t crt14 (uint64_t za, uint64_t a, uint32_t z14) { return crt2(crt7(za,a,mod7(z14)),7*a,z14&1); }
// km1,km2,km7,km14 are all divisible by gcd(3k,18) (and by 81 for k=3) and have a unique z per d
// these are the moduli we will use when d is coprime to k (this includes all prime d since we never allow d|k)
static uint32_t km1,km2,km7,km14,kmzs[243],kminvs[243]; // 243 is max(81+162,9+18+63+126)
static uint32_t *km1ztab,*km2ztab,*km7ztab,*km14ztab; // pointers into kmztab
static uint32_t *km1itab,*km2itab,*km7itab,*km14itab; // pointers into kmitab
static uint32_t km[4], *kmztab[4], *kmitab[4]; // km[] = {km1,km2,km7,km14}, ...
static uint64_t *kminv;
#define MAXK27BCNT 58 // for admissible k < 1000 (of these at most 45 come from beneficial m)
static uint16_t k27bcnt; // number of distinct benefit(w,m) values
static struct k27brec {
uint16_t fi; // index of divisor of 27k listed in k27ftab
uint16_t w; // number of residue classes mod m
uint32_t b; // benfit(w,m)
uint32_t fmask; // bitmap with ith bit set, for the unique i for which k27ftab[i].m == m
} k27btab[MAXK27BCNT];
static uint8_t *k27rtab; // list of k27fcnt-element vectors of indexes into 27btab, entry d*k27fcnt holds the entry for d mod k27
static int ui16_cmp (const void *a, const void *b)
{ return *((uint16_t*)a) < *((uint16_t*)b) ? -1 : ( *((int16_t*)a) > *((int16_t*)b) ? 1 : 0 ); }
static int ui32_cmp (const void *a, const void *b)
{ return *((uint32_t*)a) < *((uint32_t*)b) ? -1 : ( *((int32_t*)a) > *((int32_t*)b) ? 1 : 0 ); }
//static int ui64_cmp (const void *a, const void *b)
// { return *((uint64_t*)a) < *((uint64_t*)b) ? -1 : ( *((int64_t*)a) > *((int64_t*)b) ? 1 : 0 ); }
static inline uint16_t k27red (uint64_t a)
{ return b32_red (a,k27,k27minv); }
// returns a list of indexes into k27ftab of divisors compatible with ki (properly divisible by kdtab[ki]) ranked by benefit
// such that each divisor is larger than the previous and brings additional benefit relative to the previous one
static inline unsigned ranked_mi (uint8_t mis[MAXK27FCNT], uint32_t bs[MAXK27FCNT], uint64_t d, uint16_t ki) // ki is an index into kdtab
{
if ( K == 3 ) return 0;
struct k27brec *x;
uint8_t *r = k27rtab + k27red(d)*k27fcnt;
uint32_t m, mm, w;
unsigned i, n;
softassert (ki < kdcnt);
uint32_t fmask = kdtab[ki].fmask;
softassert (fmask);
// get our first choice (in terms of benefit/bit)
for ( i = 0 ; i < k27fcnt ; i++ ) { x = k27btab+r[i]; if ( fmask & x->fmask ) { mis[0] = x->fi; bs[0] = x->b; w = x->w; m = k27ftab[mis[0]].m; break; } }
softassert ( i < k27fcnt );
// add successively worse divisors as long as they are larger than the previous and offer some relative benefit
for ( n = 1 ; i < k27fcnt ; i++ )
{ x = k27btab+r[i]; if ( fmask & x->fmask && (mm=k27ftab[x->fi].m) > m && w*mm > m*x->w ) { mis[n] = x->fi; bs[n++] = x->b; w = x->w; m = mm; } }
return n;
}
// sets d[] to a sorted list of divisors of 16-bit integer with prime factorization prod_{1<=i<=n} p[i]^e[i], returns length = number of divisiors
static inline int ui16_divisors (uint16_t d[], uint16_t p[], int e[], int n)
{
int v[n];
int i, j, k, s;
assert (n);
memset(v,0,sizeof(v));
for ( s = 0 ; v[0] <= e[0] ; s++ ) {
for ( i = 0, k = 1 ; i < n ; i++ ) for ( j = 0 ; j < v[i] ; j++ ) k *= p[i];
assert (!s||k != d[s-1]);
d[s] = k;
v[n-1]++;
for ( i = n-1 ; i && v[i] > e[i] ; i-- ) { v[i] = 0; v[i-1]++; }
}
qsort (d, s, sizeof(d[0]), ui16_cmp);
return s;
}
void precompute_kdata (uint32_t k, uint64_t dmax)
{
uint64_t *bm, bytes, mem=0;
uint32_t *ztab, *w32, z;
uint16_t *invs, *cnts, *zs, *w16;
uint64_t p, minv;
int i, ii, j, d, dd, m, mm, n, max, tot, totzs;
int sk27;
assert (goodk(k));
report_timer_reset();
// compute the prime factorization of k (at most 4 prime factors)
for ( i = 1 ; (p=_primes_small_primes[i]) <= k ; i++ ) {
if ( k%p ) continue;
j = ( k > p*p && k%(p*p) == 0 ) ? 2 : 1;
if ( p == 3 ) assert ( j == 1 );
assert ( k < p*p*p || k%(p*p*p) ); // k must be cubefree
assert ( kpcnt < 4 );
kp[kpcnt] = p; kv[kpcnt] = j; kq[kpcnt] = j == 1 ? p : p*p;
kpcnt++;
}
K = k;
keps = ( mod9(k)==3 ? 1 : -1 );
k27 = 27*k; k27minv = b32_inv(k27);
// compute list of divisors of 27*k
uint16_t k27factors[64];
int k27e[kpcnt];
for ( i = 0 ; i < kpcnt ; i++ ) k27e[i] = (kp[i] == 3) ? 4 : kv[i];
k27fcnt = ui16_divisors(k27factors, kp, k27e, kpcnt);
assert (k27fcnt*sizeof(k27factors[0]) < sizeof(k27factors) && k27factors[0] == 1 && k27factors[k27fcnt-1] == k27);
// remove divisors not divisible by mmin = k==3 ? 81 : gcd(18,3k), we will always use m divisible by mmin, since the benefit for mmin is 1 bit/bit
mm = (k==3 ? 81 : 9*(k&1?1:2));
for ( i = j = 0 ; i < k27fcnt ; i++ ) if ( !(k27factors[i]%mm) ) k27factors[j++] = k27factors[i];
k27fcnt = j;
assert (k27fcnt && k27fcnt*sizeof(k27ftab[0]) <= sizeof(k27ftab));
for ( i = 0, sk27 = 0 ; i < k27fcnt ; i++ ) sk27 += k27factors[i];
// precompute table of inverses mod 27*k and factors thereof
k27itab = invs = shared_calloc (bytes=sk27*sizeof(*k27itab)); mem += bytes;
k27zcnts = cnts = shared_calloc (bytes=sk27*sizeof(*k27zcnts)); mem += bytes;
k27ztab = ztab = shared_calloc (bytes=sk27*sizeof(*k27ztab)); mem += bytes;
w16 = malloc(2*k27*sizeof(*w16)); // workspace for calls to invtab16
int k7 = mod7(k*k) == 4; // if k is +/-2 mod 7 we will want to lift mod 7m for 3/7 of the d
for ( i = k27fcnt-1 ; i >= 0 ; i-- ) {
k27ftab[i].m = m = k27factors[i];
k27ftab[i].minv[0] = minv = b32_inv(k27ftab[i].m);
k27ftab[i].minv[1] = (m&1) ? b32_inv(2*k27ftab[i].m) : 0;
k27ftab[i].minv[2] = k7 ? b32_inv(7*k27ftab[i].m) : 0;
k27ftab[i].minv[3] = (m&1) && k7 ? b32_inv(14*k27ftab[i].m) : 0;
k27ftab[i].itab = invs;
k27ftab[i].zcnts = cnts;
k27ftab[i].ztab = ztab;
if ( m == k27 ) assert (invs == k27itab);
invtab16 (invs, m, kp, kpcnt, w16);
invs += m; cnts += m; ztab += m;
}
assert (k27ftab[0].m == mm);
free (w16);
// compute admissible z's mod 27k (replaces load_kfile)
n = admissible (&k27zs, &k27m, k27ztab, k27zcnts, k);
// we now want to reduce admissible zs and ds mod all proper divisors m of 27k
// first figure out how much space we are going to need
bm = bm_alloc (k27);
totzs = 1; // skip first entry (zero offset is reserved for null
for ( i = k27fcnt-1 ; i >= 0 ; i-- ) {
m = k27ftab[i].m; minv = k27ftab[i].minv[0];
cnts = k27ftab[i].zcnts;
tot = 0; max = 0;
for ( d = 1 ; d < m ; d++ ) { // d is prime to 3 hence not 0 mod m (which is divisible by 9)
if ( !(d%3) ) continue;
if ( m < k27 ) {
bm_erase (bm,m);
// mark residue classes mod m of admissible z mod k27 for all lifts of d mod k27
for ( dd = d ; dd < k27 ; dd += m ) for ( j = 0 ; j < k27zcnts[dd] ; j++ ) bm_set(bm,b32_red(k27zs[k27ztab[dd]+j],m,minv));
cnts[d] = bm_weight(bm,m);
}
if ( !cnts[d] ) continue;
tot += cnts[d]; if ( cnts[d] > max ) max = cnts[d];
// add (m,w) to benefit table if it is new (this is a linear search of a list of at most 50 elements)
for ( j = 0 ; j < k27bcnt ; j++ ) if ( k27btab[j].fi == i && k27btab[j].w == cnts[d] ) break;
if ( j == k27bcnt ) {
assert (sizeof(k27btab[0])*k27bcnt < sizeof(k27btab));
k27btab[j].fi = i; k27btab[j].w = cnts[d]; k27btab[j].b = m == mm ? (1<<30) : benefit (cnts[d],m/mm); k27btab[j].fmask = (uint32_t)1 << i;
k27bcnt++;
}
}
k27ftab[i].ztot = tot;
if ( i == k27fcnt-1 ) assert (tot == n);
totzs += tot;
//if ( m > mm ) report_printf ("27k-divisor m=%u has an average of %.2f z's per admissible d, max %d, worstcase relative benefit is %.3f bits/bit\n", m, (double)tot/(2*m/3), max, 1.0-log(max)/log(m/mm));
}
// extend k27zs and put it in shared memory
void *ptr = shared_malloc (bytes=totzs*sizeof(*k27zs)); mem += bytes;
memcpy (ptr, k27zs, k27ftab[k27fcnt-1].ztot*sizeof(*k27zs)); free (k27zs); k27zs = ptr;
zs = k27zs + 1 + k27ftab[k27fcnt-1].ztot;
for ( i = k27fcnt-2 ; i >= 0 ; i-- ) {
m = k27ftab[i].m; minv = k27ftab[i].minv[0];
cnts = k27ftab[i].zcnts;
ztab = k27ftab[i].ztab;
for ( d = 1 ; d < m ; d++ ) { // d is prime to 3 hence not 0 mod m (which is divisible by 9)
if ( !cnts[d] ) continue;
ztab[d] = zs-k27zs;
bm_erase (bm,m);
// mark residue classes mod m of admissibles z mod k27 for all lifts of d mod k27
for ( dd = d ; dd < k27 ; dd += m ) for ( j = 0 ; j < k27zcnts[dd] ; j++ ) bm_set(bm, b32_red(k27zs[k27ztab[dd]+j],m,minv));
for ( ii = 0, j = bm_next_set(bm,0,m) ; j < m && ii < cnts[d] ; j = bm_next_set(bm,j+1,m), ii++ ) *zs++ = j;
assert (j >= m && ii == cnts[d]);
assert (zs-k27zs <= totzs);
}
}
bm_free (bm);
assert (zs-k27zs == totzs);
// Set up km1,km2,km7,km14 -- these are multiples of mm=9,18,81 that have only one z per d
// if k is even only km1,km7 are relevant, and k7,k14 are relevant only for k=+/-2 mod 7
// We will always lift modulo a multiple of the largest of these we can for a given d, and for large d we won't consider anything else.
km1 = k27ftab[0].m; if ( km1&1 ) km2 = 2*km1;
if ( mod7(k*k) == 4 ) { km7 = 7*km1; km14 = 7*km2; };
km1itab = kminvs; w32 = km1itab+km1; km1ztab = kmzs; ztab = km1ztab+km1;
if ( km2 ) { km2itab = w32; w32 += km2; km2ztab = ztab; ztab += km2; }
if ( km7 ) { km7itab = w32; w32 += km7; km7ztab = ztab; ztab += km7; }
if ( km14 ) { km14itab = w32; km14ztab = ztab; }
w32 = malloc(2*_max(_max(km1,km2),_max(km7,km14))*sizeof(*w32)); // workspace for calls to invtab32
uint32_t kp1[kpcnt]; for ( i = 0 ; i < kpcnt ; i++ ) kp1[i] = kp[i];
invtab32 (km1itab, km1, kp1, kpcnt, w32);
cnts = k27ftab[0].zcnts; ztab = k27ftab[0].ztab;
for ( d = 1 ; d < km1 ; d++ ) if ( cnts[d] ) { assert (cnts[d]==1); km1ztab[d] = k27zs[ztab[d]]; }
if ( km2 ) {
uint32_t kp2[kpcnt+1]; kp2[0] = 2; for ( i = 0 ; i < kpcnt ; i++ ) kp2[i+1] = kp[i];
invtab32 (km2itab, km2, kp2, kpcnt+1, w32);
// z=0 mod 2 only depends on d mod km1 (and both d and km1 are odd if we are in this case, but set both
for ( d = 1 ; d < km1 ; d++ ) if ( cnts[d] ) { z = crt2(k27zs[ztab[d]],km1,0); km2ztab[d] = km2ztab[d+km1] = z; }
}
if ( km7 ) {
uint32_t kp7[kpcnt+1]; for ( i = 0 ; i < kpcnt && kp[i] < 7 ; i++ ) { kp7[i] = kp[i]; } for ( kp7[i] = 7; i < kpcnt ; i++ ) kp7[i+1] = kp[i];
invtab32 (km7itab, km7, kp7, kpcnt+1, w32);
// z=0 mod 7 only depends on d mod km1 (this will only happen for suitable d, those for which onezmod7(d,si) returns true), here we assume this holds)
for ( d = 1 ; d < km1 ; d++ ) if ( cnts[d] ) { z = crt7(k27zs[ztab[d]],km1,0); for ( i = 0 ; i < 7 ; i++ ) km7ztab[d+km1*i] = z; }
}
if ( km14 ) {
uint32_t kp14[kpcnt+2]; kp14[0] = 2; for ( i = 0 ; i < kpcnt && kp[i] < 7; i++ ) { kp14[i+1] = kp[i]; } for ( kp14[i+1] = 7; i < kpcnt ; i++ ) kp14[i+2] = kp[i];
invtab32 (km14itab, km14, kp14, kpcnt+2, w32);
// z=0 mod 14 only depends on d mod km1 (this will only happen for suitable d, those for which onezmod7(d,si) returns true), here we assume this holds)
for ( d = 1 ; d < km1 ; d++ ) if ( cnts[d] ) { z = crt14(k27zs[ztab[d]],km1,0); for ( i = 0 ; i < 14 ; i++ ) km14ztab[d+km1*i] = z; }
}
free (w32);
km[0] = km1; km[1] = km2; km[2] = km7; km[3] = km14;
verbose_printf ("km=[%u,%u,%u,%u]\n",km1,km2,km7,km14);
kmitab[0] = km1itab; kmitab[1] = km2itab; kmitab[2] = km7itab; kmitab[3] = km14itab;
kmztab[0] = km1ztab; kmztab[1] = km2ztab; kmztab[2] = km7ztab; kmztab[3] = km14ztab;
kminv = k27ftab[0].minv;
// rank k27 divisors for each d mod 27k according to benefit
k27rtab = shared_calloc (bytes=sk27*k27fcnt*sizeof(*k27rtab)); mem += bytes;
for ( d = 1 ; d < k27 ; d++ ) {
if ( !k27zcnts[d] ) continue;
uint8_t *r = k27rtab+d*k27fcnt;
for ( i = 0 ; i < k27fcnt ; i++ ) {
m = k27ftab[i].m; minv = k27ftab[i].minv[0];
ii = k27ftab[i].zcnts[b32_red(d,m,minv)];
for ( j = 0 ; j < k27bcnt ; j++ ) if ( k27btab[j].fi == i && k27btab[j].w == ii ) break;
assert ( j < k27bcnt );
r[i] = j;
}
// sort r in decreasing order by k27btab[r[i]].b
for ( i = 1 ; i < k27fcnt ; i++ ) { ii = r[i]; for ( j = i-1; j >= 0 && k27btab[r[j]].b < k27btab[ii].b ; j--) { r[j+1] = r[j]; } r[j+1] = ii; }
}
// We now remove 3 from list of prime factors of k, since it cannot divide any admissible d
for ( i = j = 0 ; i < kpcnt ; i++ ) if ( kp[i] != 3 ) { kp[j] = kp[i]; kv[j] = kv[i]; kq[j] = kq[i]; j++; }
kpcnt = j;
// Because k is cubefree, any admissible d must satisfy v_p(d) = v_p(k) for all primes p|d
kdcnt = (1<<kpcnt);
kdmin = 0;
for ( i = 0 ; i < kdcnt ; i++ ) {
m = n = 1; // compute d = m*n*n with m and n squarefree and coprime, for k <= 1000 we have n <= 31
for ( int j = 0 ; j < kpcnt ; j++ ) if ( i&(1<<j) ) { if ( kv[j] == 2 ) n *= kp[j]; else m *= kp[j]; }
kdtab[i].d = m*n*n;
kdtab[i].n = n;
// now lookup m=lcm(d,mm) in k27ftab to get fi, and compute fmask identifying divisors of 27k we might want to lift to for this d
m = ui64_lcm(kdtab[i].d,mm);
kdtab[i].fmask = 0;
kdtab[i].fi = k27fcnt; // sanity check
for ( j = 0 ; j < k27fcnt ; j++ ) {
if ( k27ftab[j].m == m ) kdtab[i].fi = j;
else if ( !(k27ftab[j].m % m) ) {
n = k27ftab[j].m / m;
if ( n%4 && n%49 && n%169 ) kdtab[i].fmask |= 1<<j; // relative factors of 2^2, 7^2, 13^2 bring 0 benefit, so skip these
}
}
assert (kdtab[i].fi < k27fcnt);
if ( kdtab[i].d > 1 ) if ( ! kdmin || kdtab[i].d < kdmin ) kdmin = kdtab[i].d;
}
qsort (kdtab, kdcnt, sizeof(kdtab[0]), ui16_cmp);
for ( int i = 0 ; i < kdcnt ; i++ ) kdmax[i] = dmax / kdtab[i].d;
report_printf ("Precomputed %u (d,z) pairs via cubic reciprocity mod divisors of 27*k=%d in %.1fs using %.1f MB shared memory\n",
totzs, k27, report_timer_elapsed(), (double) mem/(1<<20));
}