-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathswitches.h
132 lines (100 loc) · 5.39 KB
/
switches.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
// this file contains the various compile time settings/swithes
#define DEBUG_PRINT_MOVES 0
#if DEBUG_PRINT_MOVES == 1
#define DEBUG_PRINT_DEPTH 6
bool printMoves = false;
#endif
// can be tuned as per need
// 256 works best for Maxwell
// 384 best for newer chips!
// (also make sure max registers used is set to 47 on maxwell and 64 on newer chips) or set max registers to 0 and enable LIMIT_REGISTER_USAGE
#define BLOCK_SIZE 384
// limit max used registers to 64 for some kernels
// improves occupancy and performance (but doesn't work with debug info or debug builds)
#define LIMIT_REGISTER_USE 1
// 3 works best with 384 block size on new chips
#define MIN_BLOCKS_PER_MP 3
// 768 MB preallocated memory size (for holding the perft tree in GPU memory)
// on systems with more video memory (like Titan X), we can use 3x of this to hold bigger trees
//#define PREALLOCATED_MEMORY_SIZE (1536 * 1024 * 1024ull)
#define PREALLOCATED_MEMORY_SIZE (3072 * 1024 * 1024ull)
// 512 KB ought to be enough for holding the stack for the serial part of the gpu perft
#define GPU_SERIAL_PERFT_STACK_SIZE (512 * 1024)
// use constant memory for accessing lookup tables (except for magic tables as they are huge)
// the default is to use texture cache via __ldg instruction
// (doesn't really affect performance either way. Maybe just a tiny bit slower with fancy magics)
// Ankan - improves performance on Maxwell a LOT!
// - but hurts performance on Newer hardware
#define USE_CONSTANT_MEMORY_FOR_LUT 0
// make use of a hash table to avoid duplicate calculations due to transpositions
#define USE_TRANSPOSITION_TABLE 1
#if USE_TRANSPOSITION_TABLE == 1
// flag en-passent capture only when it's possible (in next move)
// default is to flag en-passent on every double pawn push
// This switch works only using using makeMove()
// This helps A LOT when using transposition tables (> 50% improvement in perft(12) time)!
// very slight regression without transposition tables so enable by default only when TT is enabled
#define EXACT_EN_PASSENT_FLAGGING 1
// find duplicates in the each level of the parallel breadth first search
// and make sure we explore them only once
#define FIND_DUPLICATES_IN_BFS 1
// windows 64 bit vs 32 bit vs linux 64 bit compromise :-/
// Windows allows overclocking (gives about 10% extra performance)
// Windows 32 bit build is 7% faster than windows 64 bit build (for some unknown reason??)!
// .. but 32 bit windows build is address space limited.. best settings: don't use sysmem hash, TT_BITS: 25, TT2_BITS: 25, TT3_BITS: 26, TT4_BITS: 25 (total 1.5 GB HASH)
// .. 64 bit windows doesn't allow creating pow-2 vidmem allocations bigger than 1 GB, and sysmem allocations bigger than 2 GB. best settings: TT_BITS: 27, TT2/TT3 BITS: 26, TT4_BITS: 27 (total 4 GB HASH using 2 GB sysmem)
// .. Linux allows using any amount of memory (limited by sysmem/vidmem sizes), so best setting is to distribute all of available vidmem and sysmem to cover all transposition tables adequately
// 9 JUL 2016: Looks like on Windows 10 and with latest drivers, there is no longer any limitation.
// .. except for cudaHostAlloc() which doesn't seem to allow allocating > 6GB of memory on 16 GB system :-/
// size of the transposition table (in number of entries)
// for entries of 16 bytes (shallow TT entries)
// 28 bits: 256 million entries -> 4 GB
// 27 bits: 128 million entries -> 2 GB
// 25 bits: 32 million entries -> 512 MB
#define TT_SIZE_FROM_BITS (1ull << TT_BITS)
// size is simply 2 power of bits
#define GET_TT_SIZE_FROM_BITS(bits) (1ull << (bits))
// bits of the hash used as index into the transposition table
#define GET_TT_INDEX_BITS(bits) (GET_TT_SIZE_FROM_BITS(bits) - 1)
// remaining bits (that are stored per hash entry)
#define GET_TT_HASH_BITS(bits) (ALLSET ^ GET_TT_INDEX_BITS(bits))
#endif
// count the no of times countMoves() got called (useful to find hash table effectiveness)
#define COUNT_NUM_COUNT_MOVES 0
// print various hash statistics
#define PRINT_HASH_STATS 0
// move generation functions templated on chance
// +9% benefit on GM204
#define USE_TEMPLATE_CHANCE_OPT 1
// bitwise magic instead of if/else for castle flag updation
// turning this off helps Maxwell a little
#define USE_BITWISE_MAGIC_FOR_CASTLE_FLAG_UPDATION 0
// intel core 2 doesn't have popcnt instruction
#define USE_POPCNT 0
// pentium 4 doesn't have fast HW bitscan
#define USE_HW_BITSCAN 1
// use lookup tabls for figuring out squares in line and squares in between
#define USE_IN_BETWEEN_LUT 1
// use lookup table for king moves
#define USE_KING_LUT 1
// use lookup table for knight moves
#define USE_KNIGHT_LUT 1
// use lookup table (magics) for sliding moves
// reduces performance by ~7% for GPU version
// helps maxwell a lot (> +10%)
#define USE_SLIDING_LUT 1
// use fancy fixed-shift version - ~ 800 KB lookup tables
// (setting this to 0 enables plain magics - with 2.3 MB lookup table)
// plain magics is a bit faster at least for perft (on core 2 duo)
// fancy magics is clearly faster on more recent processors (ivy bridge)
// fancy magics a very very tiny bit faster for Maxwell (actually almost exactly same speed)
#define USE_FANCY_MAGICS 1
// use byte lookup for fancy magics (~150 KB lookup tables)
// around 3% slower than fixed shift fancy magics on CPU
// and > 10% slower on GPU!
#define USE_BYTE_LOOKUP_FANCY 0
#ifdef __CUDACC__
#define CUDA_CALLABLE_MEMBER __host__ __device__
#else
#define CUDA_CALLABLE_MEMBER
#endif