HMSBEAGLE
1.0.0
Main Page
Namespaces
Classes
Files
File List
libhmsbeagle
GPU
GPUImplDefs.h
1
/*
2
*
3
* Copyright 2009 Phylogenetic Likelihood Working Group
4
*
5
* This file is part of BEAGLE.
6
*
7
* BEAGLE is free software: you can redistribute it and/or modify
8
* it under the terms of the GNU Lesser General Public License as
9
* published by the Free Software Foundation, either version 3 of
10
* the License, or (at your option) any later version.
11
*
12
* BEAGLE is distributed in the hope that it will be useful,
13
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
* GNU Lesser General Public License for more details.
16
*
17
* You should have received a copy of the GNU Lesser General Public
18
* License along with BEAGLE. If not, see
19
* <http://www.gnu.org/licenses/>.
20
*
21
* @author Marc Suchard
22
* @author Daniel Ayres
23
*/
24
25
#ifndef __GPUImplDefs__
26
#define __GPUImplDefs__
27
28
#ifdef HAVE_CONFIG_H
29
#include "libhmsbeagle/config.h"
30
#endif
31
#include "libhmsbeagle/platform.h"
32
33
#include <cfloat>
34
35
/*
 * Build-time switches.
 *
 * Each switch is a plain "defined / not defined" flag: remove the leading
 * "//" to define it.  Only BEAGLE_MEMORY_PINNED is defined by default.
 * NOTE(review): the exact effect of each flag is decided by the code that
 * tests it, which is not part of this header - confirm there before toggling.
 */
//#define BEAGLE_DEBUG_FLOW
//#define BEAGLE_DEBUG_VALUES
//#define BEAGLE_DEBUG_SYNCH

#define BEAGLE_MEMORY_PINNED
//#define BEAGLE_FILL_4_STATE_SCALAR_SS
//#define BEAGLE_FILL_4_STATE_SCALAR_SP

#define BEAGLE_CACHED_MATRICES_COUNT 3 // max number of matrices that can be cached for a single memcpy to device operation
44
45
/*
 * Definition of REAL can be switched between 'double' and 'float'.
 *
 * Define DOUBLE_PRECISION before including this header to select 'double';
 * otherwise 'float' is used.  REAL_MIN / REAL_MAX map onto the matching
 * <cfloat> limits.
 *
 * Scaling constants:
 *   SCALING_FACTOR_COUNT / SCALING_FACTOR_OFFSET
 *       size and zero point of the exponent range given in the trailing
 *       comments (-1022..1023 for double, -126..127 for float).
 *   SCALING_EXPONENT_THRESHOLD
 *       exponent magnitude paired with the two thresholds below.
 *   SCALING_THRESHOLD_LOWER / SCALING_THRESHOLD_UPPER
 *       numerically 2^-SCALING_EXPONENT_THRESHOLD and
 *       2^+SCALING_EXPONENT_THRESHOLD (2^-200 / 2^200 for double,
 *       2^-20 / 2^20 for float); keep them in step when retuning.
 */
#ifdef DOUBLE_PRECISION
    #define REAL                        double
    #define REAL_MIN                    DBL_MIN
    #define REAL_MAX                    DBL_MAX
    #define SCALING_FACTOR_COUNT        2046 // -1022, 1023
    #define SCALING_FACTOR_OFFSET       1022 // the zero point
    #define SCALING_EXPONENT_THRESHOLD  200 // TODO: find optimal value for SCALING_EXPONENT_THRESHOLD
    #define SCALING_THRESHOLD_LOWER     6.22301528e-61 // TODO: find optimal value for SCALING_THRESHOLD
    #define SCALING_THRESHOLD_UPPER     1.60693804e60 // TODO: find optimal value for SCALING_THRESHOLD
#else
    #define REAL                        float
    #define REAL_MIN                    FLT_MIN
    #define REAL_MAX                    FLT_MAX
    #define SCALING_FACTOR_COUNT        254 // -126, 127
    #define SCALING_FACTOR_OFFSET       126 // the zero point
    #define SCALING_EXPONENT_THRESHOLD  20 // TODO: find optimal value for SCALING_EXPONENT_THRESHOLD
    #define SCALING_THRESHOLD_LOWER     9.53674316e-7 // TODO: find optimal value for SCALING_THRESHOLD
    #define SCALING_THRESHOLD_UPPER     1048576 // TODO: find optimal value for SCALING_THRESHOLD
#endif

/* Byte sizes of the floating-point and integer types selected above. */
#define SIZE_REAL   sizeof(REAL)
#define INT         int
#define SIZE_INT    sizeof(INT)
69
70
/* Compiler definitions
 *
 * PADDED_STATE_COUNT - # of total states after augmentation;
 *                      *should* be a multiple of 16
 *
 * PATTERN_BLOCK_SIZE - # of patterns to pack onto each thread-block in pruning
 *                      ( x 4 for PADDED_STATE_COUNT==4);
 *                      PATTERN_BLOCK_SIZE * PADDED_STATE_COUNT <= 512
 *
 * MATRIX_BLOCK_SIZE - # of matrices to pack onto each thread-block when integrating
 *                     the likelihood and storing in dynamic weighting;
 *                     MATRIX_BLOCK_SIZE * PADDED_STATE_COUNT <= 512
 *                   - TODO: Currently matrixCount must be < MATRIX_BLOCK_SIZE, fix!
 *
 * BLOCK_PEELING_SIZE - # of states to pre-fetch in the inner sum in pruning;
 *                      BLOCK_PEELING_SIZE <= PATTERN_BLOCK_SIZE and
 *                      *must* be a divisor of PADDED_STATE_COUNT
 *
 * IS_POWER_OF_TWO - 1 if PADDED_STATE_COUNT = 2^{N} for some integer N, otherwise 0
 *
 * SMALLEST_POWER_OF_TWO - Smallest power of 2 greater than or equal to PADDED_STATE_COUNT
 *                         (equal to PADDED_STATE_COUNT when that is already a power of 2)
 *
 * SLOW_REWEIGHING - 1 if the slow reweighing algorithm is required, otherwise 0
 *
 */
96
97
/* Table of pre-optimized compiler definitions
 *
 * One group of six constants per supported PADDED_STATE_COUNT.  Names follow
 * <SETTING>_<PRECISION>_<STATES> so that a setting can be looked up by token
 * pasting.
 *
 * NOTE(review): some entries do not satisfy the bounds stated in this
 * header's "Compiler definitions" comment - e.g. PATTERN_BLOCK_SIZE * 80 =
 * 640 and MATRIX_BLOCK_SIZE * 128 = 1024 both exceed 512, and the 4-state
 * BLOCK_PEELING_SIZE (8) is not a divisor of 4.  Confirm whether those
 * documented bounds are stale before relying on them.
 */

// SINGLE PRECISION definitions

// PADDED_STATE_COUNT == 4
#define PATTERN_BLOCK_SIZE_SP_4         16
#define MATRIX_BLOCK_SIZE_SP_4          8
#define BLOCK_PEELING_SIZE_SP_4         8
#define IS_POWER_OF_TWO_SP_4            1
#define SMALLEST_POWER_OF_TWO_SP_4      4
#define SLOW_REWEIGHING_SP_4            0

// PADDED_STATE_COUNT == 16
// TODO: find optimal settings
#define PATTERN_BLOCK_SIZE_SP_16        8
#define MATRIX_BLOCK_SIZE_SP_16         8
#define BLOCK_PEELING_SIZE_SP_16        8
#define IS_POWER_OF_TWO_SP_16           1
#define SMALLEST_POWER_OF_TWO_SP_16     16
#define SLOW_REWEIGHING_SP_16           0

// PADDED_STATE_COUNT == 32
// TODO: find optimal settings
#define PATTERN_BLOCK_SIZE_SP_32        8
#define MATRIX_BLOCK_SIZE_SP_32         8
#define BLOCK_PEELING_SIZE_SP_32        8
#define IS_POWER_OF_TWO_SP_32           1
#define SMALLEST_POWER_OF_TWO_SP_32     32
#define SLOW_REWEIGHING_SP_32           0

// PADDED_STATE_COUNT == 48
#define PATTERN_BLOCK_SIZE_SP_48        8
#define MATRIX_BLOCK_SIZE_SP_48         8
#define BLOCK_PEELING_SIZE_SP_48        8
#define IS_POWER_OF_TWO_SP_48           0
#define SMALLEST_POWER_OF_TWO_SP_48     64
#define SLOW_REWEIGHING_SP_48           0

// PADDED_STATE_COUNT == 64
#define PATTERN_BLOCK_SIZE_SP_64        8
#define MATRIX_BLOCK_SIZE_SP_64         8
#define BLOCK_PEELING_SIZE_SP_64        8
#define IS_POWER_OF_TWO_SP_64           1
#define SMALLEST_POWER_OF_TWO_SP_64     64
#define SLOW_REWEIGHING_SP_64           0

// PADDED_STATE_COUNT == 80
#define PATTERN_BLOCK_SIZE_SP_80        8
#define MATRIX_BLOCK_SIZE_SP_80         8
#define BLOCK_PEELING_SIZE_SP_80        8
#define IS_POWER_OF_TWO_SP_80           0
#define SMALLEST_POWER_OF_TWO_SP_80     128
#define SLOW_REWEIGHING_SP_80           1

// PADDED_STATE_COUNT == 128
#define PATTERN_BLOCK_SIZE_SP_128       4
#define MATRIX_BLOCK_SIZE_SP_128        8
#define BLOCK_PEELING_SIZE_SP_128       2
#define IS_POWER_OF_TWO_SP_128          1
#define SMALLEST_POWER_OF_TWO_SP_128    128
#define SLOW_REWEIGHING_SP_128          1

// PADDED_STATE_COUNT == 192
#define PATTERN_BLOCK_SIZE_SP_192       2
#define MATRIX_BLOCK_SIZE_SP_192        8
#define BLOCK_PEELING_SIZE_SP_192       2
#define IS_POWER_OF_TWO_SP_192          0
#define SMALLEST_POWER_OF_TWO_SP_192    256
#define SLOW_REWEIGHING_SP_192          1
168
// DOUBLE PRECISION definitions TODO None of these have been checked
//
// Same layout as the single-precision table: six constants per supported
// PADDED_STATE_COUNT, named <SETTING>_DP_<STATES>.  The values match the
// single-precision entries except BLOCK_PEELING_SIZE for 64 and 80 states.
//
// NOTE(review): as in the single-precision table, some entries exceed the
// "<= 512" products stated in this header's "Compiler definitions" comment
// (e.g. 8 * 80 = 640, 8 * 128 = 1024) - confirm whether those bounds are stale.

// PADDED_STATE_COUNT == 4
#define PATTERN_BLOCK_SIZE_DP_4         16
#define MATRIX_BLOCK_SIZE_DP_4          8
#define BLOCK_PEELING_SIZE_DP_4         8
#define IS_POWER_OF_TWO_DP_4            1
#define SMALLEST_POWER_OF_TWO_DP_4      4
#define SLOW_REWEIGHING_DP_4            0

// PADDED_STATE_COUNT == 16
#define PATTERN_BLOCK_SIZE_DP_16        8
#define MATRIX_BLOCK_SIZE_DP_16         8
#define BLOCK_PEELING_SIZE_DP_16        8
#define IS_POWER_OF_TWO_DP_16           1
#define SMALLEST_POWER_OF_TWO_DP_16     16
#define SLOW_REWEIGHING_DP_16           0

// PADDED_STATE_COUNT == 32
#define PATTERN_BLOCK_SIZE_DP_32        8
#define MATRIX_BLOCK_SIZE_DP_32         8
#define BLOCK_PEELING_SIZE_DP_32        8
#define IS_POWER_OF_TWO_DP_32           1
#define SMALLEST_POWER_OF_TWO_DP_32     32
#define SLOW_REWEIGHING_DP_32           0

// PADDED_STATE_COUNT == 48
#define PATTERN_BLOCK_SIZE_DP_48        8
#define MATRIX_BLOCK_SIZE_DP_48         8
#define BLOCK_PEELING_SIZE_DP_48        8
#define IS_POWER_OF_TWO_DP_48           0
#define SMALLEST_POWER_OF_TWO_DP_48     64
#define SLOW_REWEIGHING_DP_48           0

// PADDED_STATE_COUNT == 64
#define PATTERN_BLOCK_SIZE_DP_64        8
#define MATRIX_BLOCK_SIZE_DP_64         8
#define BLOCK_PEELING_SIZE_DP_64        4 // Can use 8 on GTX480
#define IS_POWER_OF_TWO_DP_64           1
#define SMALLEST_POWER_OF_TWO_DP_64     64
#define SLOW_REWEIGHING_DP_64           0

// PADDED_STATE_COUNT == 80
#define PATTERN_BLOCK_SIZE_DP_80        8
#define MATRIX_BLOCK_SIZE_DP_80         8
#define BLOCK_PEELING_SIZE_DP_80        4 // Can use 8 on GTX480
#define IS_POWER_OF_TWO_DP_80           0
#define SMALLEST_POWER_OF_TWO_DP_80     128
#define SLOW_REWEIGHING_DP_80           1

// PADDED_STATE_COUNT == 128
#define PATTERN_BLOCK_SIZE_DP_128       4
#define MATRIX_BLOCK_SIZE_DP_128        8
#define BLOCK_PEELING_SIZE_DP_128       2
#define IS_POWER_OF_TWO_DP_128          1
#define SMALLEST_POWER_OF_TWO_DP_128    128
#define SLOW_REWEIGHING_DP_128          1

// PADDED_STATE_COUNT == 192
#define PATTERN_BLOCK_SIZE_DP_192       2
#define MATRIX_BLOCK_SIZE_DP_192        8
#define BLOCK_PEELING_SIZE_DP_192       2
#define IS_POWER_OF_TWO_DP_192          0
#define SMALLEST_POWER_OF_TWO_DP_192    256
#define SLOW_REWEIGHING_DP_192          1
233
234
/*
 * Map a caller-supplied STATE_COUNT onto PADDED_STATE_COUNT.
 *
 * When STATE_COUNT is defined it must be one of the state counts that has an
 * entry in the tuning tables; any other value stops the build.  When
 * STATE_COUNT is not defined at all, PADDED_STATE_COUNT is left undefined
 * and no error is raised.
 */
#ifdef STATE_COUNT
    #if (STATE_COUNT == 4 || STATE_COUNT == 16 || STATE_COUNT == 32 || STATE_COUNT == 48 || STATE_COUNT == 64 || STATE_COUNT == 80 || STATE_COUNT == 128 || STATE_COUNT == 192)
        #define PADDED_STATE_COUNT STATE_COUNT
    #else
        /* Reached only when STATE_COUNT is defined but has no table entry. */
        #error *** STATE_COUNT is not one of the supported state counts (4, 16, 32, 48, 64, 80, 128, 192) ***
    #endif
#endif
241
242
// Need nested macros: first for replacement, second for evaluation
//
// GET*_VALUE expand their arguments (so PREC becomes SP/DP and
// PADDED_STATE_COUNT becomes its number) and then hand them to GET*_NO_CALL,
// which pastes them together with '_' separators, e.g.
//   GET_VALUE(PATTERN_BLOCK_SIZE, PREC, PADDED_STATE_COUNT) -> PATTERN_BLOCK_SIZE_SP_4
#define GET2_NO_CALL(x, y)      x##_##y
#define GET2_VALUE(x, y)        GET2_NO_CALL(x, y)
#define GET_NO_CALL(x, y, z)    x##_##y##_##z
#define GET_VALUE(x, y, z)      GET_NO_CALL(x, y, z)

// Precision tag used as the middle part of every table key.
#ifdef DOUBLE_PRECISION
    #define PREC DP
#else
    #define PREC SP
#endif

// Settings selected from the tuning tables for the current precision and
// state count.  Each selector passes its own name as the first argument; a
// macro is not re-expanded inside its own expansion, so the name survives
// as a plain token and becomes the prefix of the table key.
#define PATTERN_BLOCK_SIZE      GET_VALUE(PATTERN_BLOCK_SIZE, PREC, PADDED_STATE_COUNT)
#define MATRIX_BLOCK_SIZE       GET_VALUE(MATRIX_BLOCK_SIZE, PREC, PADDED_STATE_COUNT)
#define BLOCK_PEELING_SIZE      GET_VALUE(BLOCK_PEELING_SIZE, PREC, PADDED_STATE_COUNT)

// IS_POWER_OF_TWO and SLOW_REWEIGHING end up as defined/undefined flags
// rather than 0/1 values.  The CHECK_* helpers must be evaluated before the
// flag of the same stem is defined: afterwards the (empty) flag would be
// substituted for the first argument and the pasted key would be wrong.
#define CHECK_IS_POWER_OF_TWO   GET_VALUE(IS_POWER_OF_TWO, PREC, PADDED_STATE_COUNT)
#if (CHECK_IS_POWER_OF_TWO == 1)
    #define IS_POWER_OF_TWO
#endif
#define SMALLEST_POWER_OF_TWO   GET_VALUE(SMALLEST_POWER_OF_TWO, PREC, PADDED_STATE_COUNT)
#define CHECK_SLOW_REWEIGHING   GET_VALUE(SLOW_REWEIGHING, PREC, PADDED_STATE_COUNT)
#if (CHECK_SLOW_REWEIGHING == 1)
    #define SLOW_REWEIGHING
#endif


// State count independent
#define SUM_SITES_BLOCK_SIZE_DP     128
#define SUM_SITES_BLOCK_SIZE_SP     128
#define MULTIPLY_BLOCK_SIZE_DP      16
#define MULTIPLY_BLOCK_SIZE_SP      16

// Precision-only lookups (no state-count suffix).
#define SUM_SITES_BLOCK_SIZE    GET2_VALUE(SUM_SITES_BLOCK_SIZE, PREC)
#define MULTIPLY_BLOCK_SIZE     GET2_VALUE(MULTIPLY_BLOCK_SIZE, PREC)
276
277
/*
 * MEMCNV - element-wise converting copy.
 *
 * Copies the first `length` elements of `from` into `to`, casting each
 * element to `toType` on the way.
 *
 *   to      destination; any expression that can be subscripted
 *   from    source; any expression that can be subscripted
 *   length  number of elements to convert (re-evaluated on every iteration,
 *           so avoid arguments with side effects)
 *   toType  type name used for the cast
 *
 * Every value argument is parenthesised so that expressions such as
 * `buf + offset` or `flag ? a : b` are handled as a unit, and the loop
 * counter has a name that will not capture a caller variable (an argument
 * called `m` used to be shadowed by the counter).
 *
 * The expansion stays a plain brace block, so existing call sites written
 * with or without a trailing semicolon keep compiling.
 */
#define MEMCNV(to, from, length, toType) { \
    int memcnvIndex_; \
    for (memcnvIndex_ = 0; memcnvIndex_ < (length); memcnvIndex_++) { \
        (to)[memcnvIndex_] = (toType) (from)[memcnvIndex_]; \
    } \
}
283
284
typedef struct Dim3Int Dim3Int;

/*
 * Three unsigned integer components x, y and z.
 *
 * When compiled as C++ the type has a constructor whose arguments all
 * default to 1, so `Dim3Int()` yields (1, 1, 1) and `Dim3Int(n)` yields
 * (n, 1, 1).  When compiled as C there is no constructor and the members
 * hold whatever the caller assigns.
 * NOTE(review): presumably a host-side stand-in for a GPU launch dimension
 * triple - confirm against the code that uses it.
 */
struct Dim3Int
{
    unsigned int x, y, z;
#if defined(__cplusplus)
    Dim3Int(unsigned int xArg = 1,
            unsigned int yArg = 1,
            unsigned int zArg = 1) : x(xArg), y(yArg), z(zArg) {}
#endif /* __cplusplus */
};
295
296
#endif // __GPUImplDefs__
Generated on Fri May 25 2012 13:22:53 for HMSBEAGLE by
1.8.1