HMSBEAGLE  1.0.0
GPUImplDefs.h
1 /*
2  *
3  * Copyright 2009 Phylogenetic Likelihood Working Group
4  *
5  * This file is part of BEAGLE.
6  *
7  * BEAGLE is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU Lesser General Public License as
9  * published by the Free Software Foundation, either version 3 of
10  * the License, or (at your option) any later version.
11  *
12  * BEAGLE is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with BEAGLE. If not, see
19  * <http://www.gnu.org/licenses/>.
20  *
21  * @author Marc Suchard
22  * @author Daniel Ayres
23  */
24 
25 #ifndef __GPUImplDefs__
26 #define __GPUImplDefs__
27 
28 #ifdef HAVE_CONFIG_H
29 #include "libhmsbeagle/config.h"
30 #endif
31 #include "libhmsbeagle/platform.h"
32 
33 #include <cfloat>
34 
/*
 * Optional debug symbols. All three are left undefined by default; remove the
 * leading "//" on a line to define that symbol for the build.
 */
//#define BEAGLE_DEBUG_FLOW
//#define BEAGLE_DEBUG_VALUES
//#define BEAGLE_DEBUG_SYNCH

/*
 * Build options. BEAGLE_MEMORY_PINNED is the only one defined by default; the
 * two BEAGLE_FILL_4_STATE_SCALAR_* symbols are left undefined.
 */
#define BEAGLE_MEMORY_PINNED
//#define BEAGLE_FILL_4_STATE_SCALAR_SS
//#define BEAGLE_FILL_4_STATE_SCALAR_SP

/* Max number of matrices that can be cached for a single memcpy to device operation. */
#define BEAGLE_CACHED_MATRICES_COUNT 3
44 
/*
 * Floating-point configuration.
 *
 * REAL is 'float' by default and 'double' when DOUBLE_PRECISION is defined.
 * The companion constants are selected together with it:
 *   REAL_MIN / REAL_MAX          - the FLT_* or DBL_* limit of the chosen type
 *   SCALING_FACTOR_COUNT         - number of scaling factors
 *   SCALING_FACTOR_OFFSET        - the zero point within those factors
 *   SCALING_EXPONENT_THRESHOLD   - TODO: find optimal value
 *   SCALING_THRESHOLD_LOWER/UPPER - TODO: find optimal values
 */
#if !defined(DOUBLE_PRECISION)
    #define REAL                        float
    #define REAL_MIN                    FLT_MIN
    #define REAL_MAX                    FLT_MAX
    #define SCALING_FACTOR_COUNT        254             // -126, 127
    #define SCALING_FACTOR_OFFSET       126             // the zero point
    #define SCALING_EXPONENT_THRESHOLD  20
    #define SCALING_THRESHOLD_LOWER     9.53674316e-7   // about 2^-20
    #define SCALING_THRESHOLD_UPPER     1048576         // 2^20
#else
    #define REAL                        double
    #define REAL_MIN                    DBL_MIN
    #define REAL_MAX                    DBL_MAX
    #define SCALING_FACTOR_COUNT        2046            // -1022, 1023
    #define SCALING_FACTOR_OFFSET       1022            // the zero point
    #define SCALING_EXPONENT_THRESHOLD  200
    #define SCALING_THRESHOLD_LOWER     6.22301528e-61  // about 2^-200
    #define SCALING_THRESHOLD_UPPER     1.60693804e60   // about 2^200
#endif

/* Integer type alias and the byte sizes of the two basic element types. */
#define INT       int
#define SIZE_REAL sizeof(REAL)
#define SIZE_INT  sizeof(INT)
69 
/* Compiler definitions
 *
 * PADDED_STATE_COUNT - # of total states after augmentation
 *      *should* be a multiple of 16
 *
 * PATTERN_BLOCK_SIZE - # of patterns to pack onto each thread-block in pruning
 *      ( x 4 for PADDED_STATE_COUNT==4)
 *      PATTERN_BLOCK_SIZE * PADDED_STATE_COUNT <= 512
 *
 * MATRIX_BLOCK_SIZE - # of matrices to pack onto each thread-block in integrating
 *      likelihood and store in dynamic weighting;
 *      MATRIX_BLOCK_SIZE * PADDED_STATE_COUNT <= 512
 *      - TODO: Currently matrixCount must be < MATRIX_BLOCK_SIZE, fix!
 *
 * BLOCK_PEELING_SIZE - # of states to pre-fetch in the inner sum in pruning;
 *      BLOCK_PEELING_SIZE <= PATTERN_BLOCK_SIZE and
 *      *must* be a divisor of PADDED_STATE_COUNT
 *
 * IS_POWER_OF_TWO - 1 if PADDED_STATE_COUNT = 2^{N} for some integer N, otherwise 0
 *
 * SMALLEST_POWER_OF_TWO - Smallest power of 2 greater than or equal to PADDED_STATE_COUNT
 *      (PADDED_STATE_COUNT itself when it is already a power of 2)
 *
 * SLOW_REWEIGHING - 1 if the slow reweighing algorithm is required, otherwise 0
 *
 */
96 
97 /* Table of pre-optimized compiler definitions
98  */
99 
// SINGLE PRECISION definitions
//
// One macro per (parameter, PADDED_STATE_COUNT) pair, named
// <PARAMETER>_SP_<PADDED_STATE_COUNT>, grouped by parameter.
// TODO: find optimal settings for PADDED_STATE_COUNT == 16 and == 32

// PATTERN_BLOCK_SIZE
#define PATTERN_BLOCK_SIZE_SP_4         16
#define PATTERN_BLOCK_SIZE_SP_16        8
#define PATTERN_BLOCK_SIZE_SP_32        8
#define PATTERN_BLOCK_SIZE_SP_48        8
#define PATTERN_BLOCK_SIZE_SP_64        8
#define PATTERN_BLOCK_SIZE_SP_80        8
#define PATTERN_BLOCK_SIZE_SP_128       4
#define PATTERN_BLOCK_SIZE_SP_192       2

// MATRIX_BLOCK_SIZE
#define MATRIX_BLOCK_SIZE_SP_4          8
#define MATRIX_BLOCK_SIZE_SP_16         8
#define MATRIX_BLOCK_SIZE_SP_32         8
#define MATRIX_BLOCK_SIZE_SP_48         8
#define MATRIX_BLOCK_SIZE_SP_64         8
#define MATRIX_BLOCK_SIZE_SP_80         8
#define MATRIX_BLOCK_SIZE_SP_128        8
#define MATRIX_BLOCK_SIZE_SP_192        8

// BLOCK_PEELING_SIZE
#define BLOCK_PEELING_SIZE_SP_4         8
#define BLOCK_PEELING_SIZE_SP_16        8
#define BLOCK_PEELING_SIZE_SP_32        8
#define BLOCK_PEELING_SIZE_SP_48        8
#define BLOCK_PEELING_SIZE_SP_64        8
#define BLOCK_PEELING_SIZE_SP_80        8
#define BLOCK_PEELING_SIZE_SP_128       2
#define BLOCK_PEELING_SIZE_SP_192       2

// IS_POWER_OF_TWO
#define IS_POWER_OF_TWO_SP_4            1
#define IS_POWER_OF_TWO_SP_16           1
#define IS_POWER_OF_TWO_SP_32           1
#define IS_POWER_OF_TWO_SP_48           0
#define IS_POWER_OF_TWO_SP_64           1
#define IS_POWER_OF_TWO_SP_80           0
#define IS_POWER_OF_TWO_SP_128          1
#define IS_POWER_OF_TWO_SP_192          0

// SMALLEST_POWER_OF_TWO
#define SMALLEST_POWER_OF_TWO_SP_4      4
#define SMALLEST_POWER_OF_TWO_SP_16     16
#define SMALLEST_POWER_OF_TWO_SP_32     32
#define SMALLEST_POWER_OF_TWO_SP_48     64
#define SMALLEST_POWER_OF_TWO_SP_64     64
#define SMALLEST_POWER_OF_TWO_SP_80     128
#define SMALLEST_POWER_OF_TWO_SP_128    128
#define SMALLEST_POWER_OF_TWO_SP_192    256

// SLOW_REWEIGHING
#define SLOW_REWEIGHING_SP_4            0
#define SLOW_REWEIGHING_SP_16           0
#define SLOW_REWEIGHING_SP_32           0
#define SLOW_REWEIGHING_SP_48           0
#define SLOW_REWEIGHING_SP_64           0
#define SLOW_REWEIGHING_SP_80           1
#define SLOW_REWEIGHING_SP_128          1
#define SLOW_REWEIGHING_SP_192          1
167 
// DOUBLE PRECISION definitions
// TODO None of these have been checked
//
// One macro per (parameter, PADDED_STATE_COUNT) pair, named
// <PARAMETER>_DP_<PADDED_STATE_COUNT>, grouped by parameter.

// PATTERN_BLOCK_SIZE
#define PATTERN_BLOCK_SIZE_DP_4         16
#define PATTERN_BLOCK_SIZE_DP_16        8
#define PATTERN_BLOCK_SIZE_DP_32        8
#define PATTERN_BLOCK_SIZE_DP_48        8
#define PATTERN_BLOCK_SIZE_DP_64        8
#define PATTERN_BLOCK_SIZE_DP_80        8
#define PATTERN_BLOCK_SIZE_DP_128       4
#define PATTERN_BLOCK_SIZE_DP_192       2

// MATRIX_BLOCK_SIZE
#define MATRIX_BLOCK_SIZE_DP_4          8
#define MATRIX_BLOCK_SIZE_DP_16         8
#define MATRIX_BLOCK_SIZE_DP_32         8
#define MATRIX_BLOCK_SIZE_DP_48         8
#define MATRIX_BLOCK_SIZE_DP_64         8
#define MATRIX_BLOCK_SIZE_DP_80         8
#define MATRIX_BLOCK_SIZE_DP_128        8
#define MATRIX_BLOCK_SIZE_DP_192        8

// BLOCK_PEELING_SIZE
#define BLOCK_PEELING_SIZE_DP_4         8
#define BLOCK_PEELING_SIZE_DP_16        8
#define BLOCK_PEELING_SIZE_DP_32        8
#define BLOCK_PEELING_SIZE_DP_48        8
#define BLOCK_PEELING_SIZE_DP_64        4   // Can use 8 on GTX480
#define BLOCK_PEELING_SIZE_DP_80        4   // Can use 8 on GTX480
#define BLOCK_PEELING_SIZE_DP_128       2
#define BLOCK_PEELING_SIZE_DP_192       2

// IS_POWER_OF_TWO
#define IS_POWER_OF_TWO_DP_4            1
#define IS_POWER_OF_TWO_DP_16           1
#define IS_POWER_OF_TWO_DP_32           1
#define IS_POWER_OF_TWO_DP_48           0
#define IS_POWER_OF_TWO_DP_64           1
#define IS_POWER_OF_TWO_DP_80           0
#define IS_POWER_OF_TWO_DP_128          1
#define IS_POWER_OF_TWO_DP_192          0

// SMALLEST_POWER_OF_TWO
#define SMALLEST_POWER_OF_TWO_DP_4      4
#define SMALLEST_POWER_OF_TWO_DP_16     16
#define SMALLEST_POWER_OF_TWO_DP_32     32
#define SMALLEST_POWER_OF_TWO_DP_48     64
#define SMALLEST_POWER_OF_TWO_DP_64     64
#define SMALLEST_POWER_OF_TWO_DP_80     128
#define SMALLEST_POWER_OF_TWO_DP_128    128
#define SMALLEST_POWER_OF_TWO_DP_192    256

// SLOW_REWEIGHING
#define SLOW_REWEIGHING_DP_4            0
#define SLOW_REWEIGHING_DP_16           0
#define SLOW_REWEIGHING_DP_32           0
#define SLOW_REWEIGHING_DP_48           0
#define SLOW_REWEIGHING_DP_64           0
#define SLOW_REWEIGHING_DP_80           1
#define SLOW_REWEIGHING_DP_128          1
#define SLOW_REWEIGHING_DP_192          1
233 
/*
 * When the build supplies STATE_COUNT it must be one of the values listed
 * below; it is then used unchanged as PADDED_STATE_COUNT. Any other value
 * stops compilation. Without STATE_COUNT, PADDED_STATE_COUNT is left
 * undefined by this block.
 */
#if defined(STATE_COUNT)
  #if STATE_COUNT != 4  && STATE_COUNT != 16 && STATE_COUNT != 32  && STATE_COUNT != 48 && \
      STATE_COUNT != 64 && STATE_COUNT != 80 && STATE_COUNT != 128 && STATE_COUNT != 192
    #error *** Precompiler directive state count not defined ***
  #endif
  #define PADDED_STATE_COUNT STATE_COUNT
#endif
241 
// Precision tag used as the middle component of the table macro names.
#if defined(DOUBLE_PRECISION)
    #define PREC DP
#else
    #define PREC SP
#endif

// Token pasting needs two macro layers: the *_VALUE layer lets its arguments be
// macro-replaced first, the *_NO_CALL layer then pastes the results together.
#define GET2_NO_CALL(name, prec)            name##_##prec
#define GET2_VALUE(name, prec)              GET2_NO_CALL(name, prec)
#define GET_NO_CALL(name, prec, states)     name##_##prec##_##states
#define GET_VALUE(name, prec, states)       GET_NO_CALL(name, prec, states)

// Settings resolved to <PARAMETER>_<PREC>_<PADDED_STATE_COUNT>.
#define PATTERN_BLOCK_SIZE      GET_VALUE(PATTERN_BLOCK_SIZE, PREC, PADDED_STATE_COUNT)
#define MATRIX_BLOCK_SIZE       GET_VALUE(MATRIX_BLOCK_SIZE, PREC, PADDED_STATE_COUNT)
#define BLOCK_PEELING_SIZE      GET_VALUE(BLOCK_PEELING_SIZE, PREC, PADDED_STATE_COUNT)
#define SMALLEST_POWER_OF_TWO   GET_VALUE(SMALLEST_POWER_OF_TWO, PREC, PADDED_STATE_COUNT)

// The two 0/1 table entries become presence flags. Each CHECK_* macro has to be
// tested before the flag of the same base name is defined: once the flag exists
// as an (empty) macro, the first argument of the lookup would itself be replaced.
#define CHECK_IS_POWER_OF_TWO   GET_VALUE(IS_POWER_OF_TWO, PREC, PADDED_STATE_COUNT)
#if CHECK_IS_POWER_OF_TWO == 1
    #define IS_POWER_OF_TWO
#endif

#define CHECK_SLOW_REWEIGHING   GET_VALUE(SLOW_REWEIGHING, PREC, PADDED_STATE_COUNT)
#if CHECK_SLOW_REWEIGHING == 1
    #define SLOW_REWEIGHING
#endif
266 
267 
// State count independent
//
// Per-precision values; the unsuffixed names resolve to the entry for PREC.
#define SUM_SITES_BLOCK_SIZE_SP     128
#define SUM_SITES_BLOCK_SIZE_DP     128
#define MULTIPLY_BLOCK_SIZE_SP      16
#define MULTIPLY_BLOCK_SIZE_DP      16

#define MULTIPLY_BLOCK_SIZE         GET2_VALUE(MULTIPLY_BLOCK_SIZE, PREC)
#define SUM_SITES_BLOCK_SIZE        GET2_VALUE(SUM_SITES_BLOCK_SIZE, PREC)
276 
/*
 * MEMCNV(to, from, length, toType)
 *
 * Element-wise converting copy: for every index i in [0, length) assigns
 * to[i] = (toType) from[i].
 *
 *   to      - destination, anything indexable with []
 *   from    - source, anything indexable with []
 *   length  - number of elements to convert (compared against an int index)
 *   toType  - type each source element is cast to before the store
 *
 * Every argument is parenthesized so that expressions such as `src + offset`
 * or `n + 1` are used as a whole, and the loop index has a macro-specific
 * name so that it cannot capture a caller variable that appears in an
 * argument (e.g. a caller passing its own `m` as length).
 *
 * The arguments are re-evaluated on every iteration, so they must not have
 * side effects. The expansion is a plain brace block, so existing call sites
 * compile unchanged with or without a trailing semicolon.
 */
#define MEMCNV(to, from, length, toType) { \
    int memcnvIndex_; \
    for (memcnvIndex_ = 0; memcnvIndex_ < (length); memcnvIndex_++) { \
        (to)[memcnvIndex_] = (toType) (from)[memcnvIndex_]; \
    } \
}
283 
/*
 * Triple of unsigned extents (x, y, z).
 *
 * Usable from both C and C++: the type can be named without the `struct`
 * keyword in either language, and C++ translation units additionally get a
 * constructor in which every omitted component defaults to 1.
 */
struct Dim3Int
{
    unsigned int x;
    unsigned int y;
    unsigned int z;
#ifdef __cplusplus
    Dim3Int(unsigned int xArg = 1, unsigned int yArg = 1, unsigned int zArg = 1)
        : x(xArg), y(yArg), z(zArg)
    {
    }
#endif /* __cplusplus */
};

typedef struct Dim3Int Dim3Int;
293 #endif /* __cplusplus */
294 };
295 
296 #endif // __GPUImplDefs__