#include "libhmsbeagle/config.h"

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cassert>

#include "libhmsbeagle/beagle.h"
#include "libhmsbeagle/GPU/GPUImplDefs.h"
#include "libhmsbeagle/GPU/BeagleGPUImpl.h"
#include "libhmsbeagle/GPU/GPUImplHelper.h"
#include "libhmsbeagle/GPU/KernelLauncher.h"
#include "libhmsbeagle/GPU/GPUInterface.h"
#include "libhmsbeagle/GPU/Precision.h"
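// Constructor: zero-initialize all device (GPUPtr) and host cache pointers so that the
// destructor can safely free only the buffers that were actually allocated.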
BEAGLE_GPU_TEMPLATE
BeagleGPUImpl<BEAGLE_GPU_GENERIC>::BeagleGPUImpl() {
    dIntegrationTmp = (GPUPtr)NULL;
    dOutFirstDeriv = (GPUPtr)NULL;
    dOutSecondDeriv = (GPUPtr)NULL;
    dPartialsTmp = (GPUPtr)NULL;
    dFirstDerivTmp = (GPUPtr)NULL;
    dSecondDerivTmp = (GPUPtr)NULL;

    dSumLogLikelihood = (GPUPtr)NULL;
    dSumFirstDeriv = (GPUPtr)NULL;
    dSumSecondDeriv = (GPUPtr)NULL;

    dPatternWeights = (GPUPtr)NULL;

    dBranchLengths = (GPUPtr)NULL;

    dDistanceQueue = (GPUPtr)NULL;

    dPtrQueue = (GPUPtr)NULL;

    dMaxScalingFactors = (GPUPtr)NULL;
    dIndexMaxScalingFactors = (GPUPtr)NULL;

    dScalingFactors = NULL;

    dCompactBuffers = NULL;
    dTipPartialsBuffers = NULL;

    hCategoryRates = NULL;
    hPatternWeightsCache = NULL;
    hDistanceQueue = NULL;
    hWeightsCache = NULL;
    hFrequenciesCache = NULL;
    hLogLikelihoodsCache = NULL;
    hPartialsCache = NULL;

    hRescalingTrigger = NULL;
    dRescalingTrigger = (GPUPtr)NULL;
    dScalingFactorsMaster = NULL;
}
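// Destructor: release all device memory, pinned host memory, and host caches acquired
// during createInstance().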
BEAGLE_GPU_TEMPLATE
BeagleGPUImpl<BEAGLE_GPU_GENERIC>::~BeagleGPUImpl() {
    for (int i = 0; i < kEigenDecompCount; i++) {
        gpu->FreeMemory(dEigenValues[i]);
        gpu->FreeMemory(dEvec[i]);
        gpu->FreeMemory(dIevc[i]);
        gpu->FreeMemory(dWeights[i]);
        gpu->FreeMemory(dFrequencies[i]);
    }

    gpu->FreeMemory(dMatrices[0]);

    if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
        gpu->FreePinnedHostMemory(hRescalingTrigger);
        for (int i = 0; i < kScaleBufferCount; i++) {
            if (dScalingFactorsMaster[i] != 0)
                gpu->FreeMemory(dScalingFactorsMaster[i]);
        }
        free(dScalingFactorsMaster);
    } else {
        if (kScaleBufferCount > 0)
            gpu->FreeMemory(dScalingFactors[0]);
    }

    for (int i = 0; i < kBufferCount; i++) {
        if (i < kCompactBufferCount)
            gpu->FreeMemory(dCompactBuffers[i]);
        if (i < kTipPartialsBufferCount)
            gpu->FreeMemory(dTipPartialsBuffers[i]);
        gpu->FreeMemory(dPartials[i]);
    }

    gpu->FreeMemory(dIntegrationTmp);
    gpu->FreeMemory(dOutFirstDeriv);
    gpu->FreeMemory(dOutSecondDeriv);
    gpu->FreeMemory(dPartialsTmp);
    gpu->FreeMemory(dFirstDerivTmp);
    gpu->FreeMemory(dSecondDerivTmp);

    gpu->FreeMemory(dSumLogLikelihood);
    gpu->FreeMemory(dSumFirstDeriv);
    gpu->FreeMemory(dSumSecondDeriv);

    gpu->FreeMemory(dPatternWeights);

    gpu->FreeMemory(dBranchLengths);

    gpu->FreeMemory(dDistanceQueue);

    gpu->FreeMemory(dPtrQueue);

    gpu->FreeMemory(dMaxScalingFactors);
    gpu->FreeMemory(dIndexMaxScalingFactors);

    if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
        gpu->FreeMemory(dAccumulatedScalingFactors);
    }

    free(dScalingFactors);

    free(dCompactBuffers);
    free(dTipPartialsBuffers);

    gpu->FreeHostMemory(hPtrQueue);

    gpu->FreeHostMemory(hCategoryRates);

    gpu->FreeHostMemory(hPatternWeightsCache);

    gpu->FreeHostMemory(hDistanceQueue);

    gpu->FreeHostMemory(hWeightsCache);
    gpu->FreeHostMemory(hFrequenciesCache);
    gpu->FreeHostMemory(hPartialsCache);
    gpu->FreeHostMemory(hStatesCache);

    gpu->FreeHostMemory(hLogLikelihoodsCache);
    gpu->FreeHostMemory(hMatrixCache);
}
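// createInstance: records the instance configuration, selects and initializes the GPU
// device, estimates the required device memory, and allocates every device and host
// buffer used by the instance.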
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::createInstance(int tipCount,
                                                      int partialsBufferCount,
                                                      int compactBufferCount,
                                                      int stateCount,
                                                      int patternCount,
                                                      int eigenDecompositionCount,
                                                      int matrixCount,
                                                      int categoryCount,
                                                      int scaleBufferCount,
                                                      int iResourceNumber,
                                                      long preferenceFlags,
                                                      long requirementFlags) {

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::createInstance\n");
#endif
    kTipCount = tipCount;
    kPartialsBufferCount = partialsBufferCount;
    kCompactBufferCount = compactBufferCount;
    kStateCount = stateCount;
    kPatternCount = patternCount;
    kEigenDecompCount = eigenDecompositionCount;
    kMatrixCount = matrixCount;
    kCategoryCount = categoryCount;
    kScaleBufferCount = scaleBufferCount;

    resourceNumber = iResourceNumber;

    kTipPartialsBufferCount = kTipCount - kCompactBufferCount;
    kBufferCount = kPartialsBufferCount + kCompactBufferCount;

    kInternalPartialsBufferCount = kBufferCount - kTipCount;
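    // Pad the state count up to the nearest size supported by the GPU kernels.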
    if (kStateCount <= 4)
        kPaddedStateCount = 4;
    else if (kStateCount <= 16)
        kPaddedStateCount = 16;
    else if (kStateCount <= 32)
        kPaddedStateCount = 32;
    else if (kStateCount <= 48)
        kPaddedStateCount = 48;
    else if (kStateCount <= 64)
        kPaddedStateCount = 64;
    else if (kStateCount <= 80)
        kPaddedStateCount = 80;
    else if (kStateCount <= 128)
        kPaddedStateCount = 128;
    else if (kStateCount <= 192)
        kPaddedStateCount = 192;
    else
        kPaddedStateCount = kStateCount + kStateCount % 16;
    int paddedPatterns = 0;
    if (kPaddedStateCount == 4 && kPatternCount % 4 != 0)
        paddedPatterns = 4 - kPatternCount % 4;

    kPaddedPatternCount = kPatternCount + paddedPatterns;

    int resultPaddedPatterns = 0;
    int patternBlockSizeFour = (kFlags & BEAGLE_FLAG_PRECISION_DOUBLE ? PATTERN_BLOCK_SIZE_DP_4 : PATTERN_BLOCK_SIZE_SP_4);
    if (kPaddedStateCount == 4 && kPaddedPatternCount % patternBlockSizeFour != 0)
        resultPaddedPatterns = patternBlockSizeFour - kPaddedPatternCount % patternBlockSizeFour;

    kScaleBufferSize = kPaddedPatternCount;
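    // Resolve the scaling mode from the preference/requirement flags: AUTO and ALWAYS
    // imply log-scale factors, DYNAMIC uses raw factors, everything else is manual scaling.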
    if (preferenceFlags & BEAGLE_FLAG_SCALING_AUTO || requirementFlags & BEAGLE_FLAG_SCALING_AUTO) {
        kFlags |= BEAGLE_FLAG_SCALING_AUTO;
        kFlags |= BEAGLE_FLAG_SCALERS_LOG;
        kScaleBufferCount = kInternalPartialsBufferCount;
        kScaleBufferSize *= kCategoryCount;
    } else if (preferenceFlags & BEAGLE_FLAG_SCALING_ALWAYS || requirementFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
        kFlags |= BEAGLE_FLAG_SCALING_ALWAYS;
        kFlags |= BEAGLE_FLAG_SCALERS_LOG;
        kScaleBufferCount = kInternalPartialsBufferCount + 1;
    } else if (preferenceFlags & BEAGLE_FLAG_SCALING_DYNAMIC || requirementFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
        kFlags |= BEAGLE_FLAG_SCALING_DYNAMIC;
        kFlags |= BEAGLE_FLAG_SCALERS_RAW;
    } else if (preferenceFlags & BEAGLE_FLAG_SCALERS_LOG || requirementFlags & BEAGLE_FLAG_SCALERS_LOG) {
        kFlags |= BEAGLE_FLAG_SCALING_MANUAL;
        kFlags |= BEAGLE_FLAG_SCALERS_LOG;
    } else {
        kFlags |= BEAGLE_FLAG_SCALING_MANUAL;
        kFlags |= BEAGLE_FLAG_SCALERS_RAW;
    }

    if (preferenceFlags & BEAGLE_FLAG_EIGEN_COMPLEX || requirementFlags & BEAGLE_FLAG_EIGEN_COMPLEX) {
        kFlags |= BEAGLE_FLAG_EIGEN_COMPLEX;
    } else {
        kFlags |= BEAGLE_FLAG_EIGEN_REAL;
    }

    if (requirementFlags & BEAGLE_FLAG_INVEVEC_TRANSPOSED || preferenceFlags & BEAGLE_FLAG_INVEVEC_TRANSPOSED)
        kFlags |= BEAGLE_FLAG_INVEVEC_TRANSPOSED;
    else
        kFlags |= BEAGLE_FLAG_INVEVEC_STANDARD;

    Real r = 0;
    modifyFlagsForPrecision(&kFlags, r);
    int sumSitesBlockSize = (kFlags & BEAGLE_FLAG_PRECISION_DOUBLE ? SUM_SITES_BLOCK_SIZE_DP : SUM_SITES_BLOCK_SIZE_SP);
    kSumSitesBlockCount = kPatternCount / sumSitesBlockSize;
    if (kPatternCount % sumSitesBlockSize != 0)
        kSumSitesBlockCount += 1;

    kPartialsSize = kPaddedPatternCount * kPaddedStateCount * kCategoryCount;
    kMatrixSize = kPaddedStateCount * kPaddedStateCount;

    if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX)
        kEigenValuesSize = 2 * kPaddedStateCount;
    else
        kEigenValuesSize = kPaddedStateCount;

    kLastCompactBufferIndex = -1;
    kLastTipPartialsBufferIndex = -1;
    numDevices = gpu->GetDeviceCount();
    if (numDevices == 0) {
        fprintf(stderr, "Error: No GPU devices\n");
        return BEAGLE_ERROR_NO_RESOURCE;
    }
    if (resourceNumber > numDevices) {
        fprintf(stderr, "Error: Trying to initialize device # %d (which does not exist)\n", resourceNumber);
        return BEAGLE_ERROR_NO_RESOURCE;
    }

    gpu->SetDevice(resourceNumber - 1, kPaddedStateCount, kCategoryCount, kPaddedPatternCount, kFlags);
    int ptrQueueLength = kMatrixCount * kCategoryCount * 3;
    if (kPartialsBufferCount > ptrQueueLength)
        ptrQueueLength = kPartialsBufferCount;

    unsigned int neededMemory = sizeof(Real) * (kMatrixSize * kEigenDecompCount +
                                                kMatrixSize * kEigenDecompCount +
                                                kEigenValuesSize * kEigenDecompCount +
                                                kCategoryCount * kPartialsBufferCount +
                                                kPaddedStateCount * kPartialsBufferCount +
                                                kPaddedPatternCount +
                                                kPaddedPatternCount +
                                                kPaddedPatternCount +
                                                kScaleBufferCount * kPaddedPatternCount +
                                                kPartialsBufferCount * kPartialsSize +
                                                kMatrixCount * kMatrixSize * kCategoryCount +
                                                kMatrixCount * kCategoryCount * 2) +
                                sizeof(int) * kCompactBufferCount * kPaddedPatternCount +
                                sizeof(GPUPtr) * ptrQueueLength;
    unsigned int availableMem = gpu->GetAvailableMemory();

#ifdef BEAGLE_DEBUG_VALUES
    fprintf(stderr, " needed memory: %d\n", neededMemory);
    fprintf(stderr, " available memory: %d\n", availableMem);
#endif

    if (availableMem < neededMemory)
        return BEAGLE_ERROR_OUT_OF_MEMORY;
    hWeightsCache = (Real*) gpu->CallocHost(kCategoryCount * kPartialsBufferCount, sizeof(Real));
    hFrequenciesCache = (Real*) gpu->CallocHost(kPaddedStateCount * kPartialsBufferCount, sizeof(Real));
    hPartialsCache = (Real*) gpu->CallocHost(kPartialsSize, sizeof(Real));
    hStatesCache = (int*) gpu->CallocHost(kPaddedPatternCount, sizeof(int));

    int hMatrixCacheSize = kMatrixSize * kCategoryCount * BEAGLE_CACHED_MATRICES_COUNT;
    if ((2 * kMatrixSize + kEigenValuesSize) > hMatrixCacheSize)
        hMatrixCacheSize = 2 * kMatrixSize + kEigenValuesSize;

    hLogLikelihoodsCache = (Real*) gpu->MallocHost(kPatternCount * sizeof(Real));
    hMatrixCache = (Real*) gpu->CallocHost(hMatrixCacheSize, sizeof(Real));
    dEvec = (GPUPtr*) calloc(sizeof(GPUPtr), kEigenDecompCount);
    dIevc = (GPUPtr*) calloc(sizeof(GPUPtr), kEigenDecompCount);
    dEigenValues = (GPUPtr*) calloc(sizeof(GPUPtr), kEigenDecompCount);
    dWeights = (GPUPtr*) calloc(sizeof(GPUPtr), kEigenDecompCount);
    dFrequencies = (GPUPtr*) calloc(sizeof(GPUPtr), kEigenDecompCount);
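    // dMatrices[0] is one contiguous allocation for all transition matrices; the remaining
    // entries are byte offsets into it, one kMatrixSize * kCategoryCount block per buffer.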
    dMatrices = (GPUPtr*) malloc(sizeof(GPUPtr) * kMatrixCount);
    dMatrices[0] = gpu->AllocateMemory(kMatrixCount * kMatrixSize * kCategoryCount * sizeof(Real));

    for (int i = 1; i < kMatrixCount; i++) {
        dMatrices[i] = dMatrices[i-1] + kMatrixSize * kCategoryCount * sizeof(Real);
    }
    if (kScaleBufferCount > 0) {
        if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
            dScalingFactors = (GPUPtr*) malloc(sizeof(GPUPtr) * kScaleBufferCount);
            dScalingFactors[0] = gpu->AllocateMemory(sizeof(signed char) * kScaleBufferSize * kScaleBufferCount);
            for (int i = 1; i < kScaleBufferCount; i++)
                dScalingFactors[i] = dScalingFactors[i-1] + kScaleBufferSize * sizeof(signed char);
        } else if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
            dScalingFactors = (GPUPtr*) calloc(sizeof(GPUPtr), kScaleBufferCount);
            dScalingFactorsMaster = (GPUPtr*) calloc(sizeof(GPUPtr), kScaleBufferCount);
            hRescalingTrigger = (int*) gpu->AllocatePinnedHostMemory(sizeof(int), false, true);
            dRescalingTrigger = gpu->GetDevicePointer((void*) hRescalingTrigger);
        } else {
            dScalingFactors = (GPUPtr*) malloc(sizeof(GPUPtr) * kScaleBufferCount);
            dScalingFactors[0] = gpu->AllocateMemory(kScaleBufferSize * kScaleBufferCount * sizeof(Real));
            for (int i = 1; i < kScaleBufferCount; i++) {
                dScalingFactors[i] = dScalingFactors[i-1] + kScaleBufferSize * sizeof(Real);
            }
        }
    }
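    // Per-eigendecomposition device buffers: eigenvectors, inverse eigenvectors,
    // eigenvalues, category weights, and state frequencies.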
    for (int i = 0; i < kEigenDecompCount; i++) {
        dEvec[i] = gpu->AllocateMemory(kMatrixSize * sizeof(Real));
        dIevc[i] = gpu->AllocateMemory(kMatrixSize * sizeof(Real));
        dEigenValues[i] = gpu->AllocateMemory(kEigenValuesSize * sizeof(Real));
        dWeights[i] = gpu->AllocateMemory(kCategoryCount * sizeof(Real));
        dFrequencies[i] = gpu->AllocateMemory(kPaddedStateCount * sizeof(Real));
    }
    dIntegrationTmp = gpu->AllocateMemory((kPaddedPatternCount + resultPaddedPatterns) * sizeof(Real));
    dOutFirstDeriv = gpu->AllocateMemory((kPaddedPatternCount + resultPaddedPatterns) * sizeof(Real));
    dOutSecondDeriv = gpu->AllocateMemory((kPaddedPatternCount + resultPaddedPatterns) * sizeof(Real));

    dPatternWeights = gpu->AllocateMemory(kPatternCount * sizeof(Real));

    dSumLogLikelihood = gpu->AllocateMemory(kSumSitesBlockCount * sizeof(Real));
    dSumFirstDeriv = gpu->AllocateMemory(kSumSitesBlockCount * sizeof(Real));
    dSumSecondDeriv = gpu->AllocateMemory(kSumSitesBlockCount * sizeof(Real));

    dPartialsTmp = gpu->AllocateMemory(kPartialsSize * sizeof(Real));
    dFirstDerivTmp = gpu->AllocateMemory(kPartialsSize * sizeof(Real));
    dSecondDerivTmp = gpu->AllocateMemory(kPartialsSize * sizeof(Real));
    dPartials = (GPUPtr*) calloc(sizeof(GPUPtr), kBufferCount);

    dStates = (GPUPtr*) calloc(sizeof(GPUPtr), kBufferCount);

    dCompactBuffers = (GPUPtr*) malloc(sizeof(GPUPtr) * kCompactBufferCount);
    dTipPartialsBuffers = (GPUPtr*) malloc(sizeof(GPUPtr) * kTipPartialsBufferCount);

    for (int i = 0; i < kBufferCount; i++) {
        if (i < kCompactBufferCount)
            dCompactBuffers[i] = gpu->AllocateMemory(kPaddedPatternCount * sizeof(int));
        if (i < kTipPartialsBufferCount)
            dTipPartialsBuffers[i] = gpu->AllocateMemory(kPartialsSize * sizeof(Real));
        dPartials[i] = gpu->AllocateMemory(kPartialsSize * sizeof(Real));
    }

    kLastCompactBufferIndex = kCompactBufferCount - 1;
    kLastTipPartialsBufferIndex = kTipPartialsBufferCount - 1;
    dBranchLengths = gpu->AllocateMemory(kBufferCount * sizeof(Real));

    dDistanceQueue = gpu->AllocateMemory(kMatrixCount * kCategoryCount * 2 * sizeof(Real));
    hDistanceQueue = (Real*) gpu->MallocHost(sizeof(Real) * kMatrixCount * kCategoryCount * 2);
    checkHostMemory(hDistanceQueue);

    dPtrQueue = gpu->AllocateMemory(sizeof(unsigned int) * ptrQueueLength);
    hPtrQueue = (unsigned int*) gpu->MallocHost(sizeof(unsigned int) * ptrQueueLength);
    checkHostMemory(hPtrQueue);

    hCategoryRates = (double*) gpu->MallocHost(sizeof(double) * kCategoryCount);
    checkHostMemory(hCategoryRates);

    hPatternWeightsCache = (Real*) gpu->MallocHost(sizeof(double) * kPatternCount);
    checkHostMemory(hPatternWeightsCache);

    dMaxScalingFactors = gpu->AllocateMemory(kPaddedPatternCount * sizeof(Real));
    dIndexMaxScalingFactors = gpu->AllocateMemory(kPaddedPatternCount * sizeof(int));

    if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
        dAccumulatedScalingFactors = gpu->AllocateMemory(sizeof(int) * kScaleBufferSize);
    }
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::createInstance\n");
#endif

#ifdef BEAGLE_DEBUG_VALUES
    int usedMemory = availableMem - gpu->GetAvailableMemory();
    fprintf(stderr, "actual used memory: %d\n", usedMemory);
    fprintf(stderr, " difference: %d\n\n", usedMemory - neededMemory);
#endif

    return BEAGLE_SUCCESS;
}
template<>
char* BeagleGPUImpl<double>::getInstanceName() {
    return (char*) "CUDA-Double";
}

template<>
char* BeagleGPUImpl<float>::getInstanceName() {
    return (char*) "CUDA-Single";
}
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::getInstanceDetails(BeagleInstanceDetails* returnInfo) {
    if (returnInfo != NULL) {
        returnInfo->flags = BEAGLE_FLAG_COMPUTATION_SYNCH |
                            BEAGLE_FLAG_THREADING_NONE |
                            BEAGLE_FLAG_VECTOR_NONE |
                            BEAGLE_FLAG_PROCESSOR_GPU;

        Real r = 0;
        modifyFlagsForPrecision(&(returnInfo->flags), r);

        returnInfo->flags |= kFlags;

        returnInfo->implName = getInstanceName();
    }
    return BEAGLE_SUCCESS;
}
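// setTipStates: copies the compact state codes for one tip to the device, mapping any
// state outside the model (ambiguities/gaps) to kPaddedStateCount and padding the tail
// of the pattern list with that same value.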
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTipStates(int tipIndex, const int* inStates) {

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::setTipStates\n");
#endif

    if (tipIndex < 0 || tipIndex >= kTipCount)
        return BEAGLE_ERROR_OUT_OF_RANGE;

    for (int i = 0; i < kPatternCount; i++)
        hStatesCache[i] = (inStates[i] < kStateCount ? inStates[i] : kPaddedStateCount);

    for (int i = kPatternCount; i < kPaddedPatternCount; i++)
        hStatesCache[i] = kPaddedStateCount;

    if (dStates[tipIndex] == 0) {
        assert(kLastCompactBufferIndex >= 0 && kLastCompactBufferIndex < kCompactBufferCount);
        dStates[tipIndex] = dCompactBuffers[kLastCompactBufferIndex--];
    }

    gpu->MemcpyHostToDevice(dStates[tipIndex], hStatesCache, sizeof(int) * kPaddedPatternCount);

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::setTipStates\n");
#endif

    return BEAGLE_SUCCESS;
}
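// setTipPartials: expands the caller-supplied (pattern x state) partials into the padded
// device layout, replicates them across rate categories, and uploads them to the GPU.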
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTipPartials(int tipIndex, const double* inPartials) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::setTipPartials\n");
#endif

    if (tipIndex < 0 || tipIndex >= kTipCount)
        return BEAGLE_ERROR_OUT_OF_RANGE;

    const double* inPartialsOffset = inPartials;
    Real* tmpRealPartialsOffset = hPartialsCache;
    for (int i = 0; i < kPatternCount; i++) {
        beagleMemCpy(tmpRealPartialsOffset, inPartialsOffset, kStateCount);
        tmpRealPartialsOffset += kPaddedStateCount;
        inPartialsOffset += kStateCount;
    }

    int partialsLength = kPaddedPatternCount * kPaddedStateCount;
    for (int i = 1; i < kCategoryCount; i++) {
        memcpy(hPartialsCache + i * partialsLength, hPartialsCache, partialsLength * sizeof(Real));
    }

    if (tipIndex < kTipCount) {
        if (dPartials[tipIndex] == 0) {
            assert(kLastTipPartialsBufferIndex >= 0 && kLastTipPartialsBufferIndex <
                   kTipPartialsBufferCount);
            dPartials[tipIndex] = dTipPartialsBuffers[kLastTipPartialsBufferIndex--];
        }
    }

    gpu->MemcpyHostToDevice(dPartials[tipIndex], hPartialsCache, sizeof(Real) * kPartialsSize);

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::setTipPartials\n");
#endif

    return BEAGLE_SUCCESS;
}
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setPartials(int bufferIndex, const double* inPartials) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::setPartials\n");
#endif

    if (bufferIndex < 0 || bufferIndex >= kPartialsBufferCount)
        return BEAGLE_ERROR_OUT_OF_RANGE;

    const double* inPartialsOffset = inPartials;
    Real* tmpRealPartialsOffset = hPartialsCache;
    for (int l = 0; l < kCategoryCount; l++) {
        for (int i = 0; i < kPatternCount; i++) {
            beagleMemCpy(tmpRealPartialsOffset, inPartialsOffset, kStateCount);
            tmpRealPartialsOffset += kPaddedStateCount;
            inPartialsOffset += kStateCount;
        }
        tmpRealPartialsOffset += kPaddedStateCount * (kPaddedPatternCount - kPatternCount);
    }

    if (bufferIndex < kTipCount) {
        if (dPartials[bufferIndex] == 0) {
            assert(kLastTipPartialsBufferIndex >= 0 && kLastTipPartialsBufferIndex <
                   kTipPartialsBufferCount);
            dPartials[bufferIndex] = dTipPartialsBuffers[kLastTipPartialsBufferIndex--];
        }
    }

    gpu->MemcpyHostToDevice(dPartials[bufferIndex], hPartialsCache, sizeof(Real) * kPartialsSize);

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::setPartials\n");
#endif

    return BEAGLE_SUCCESS;
}
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::getPartials(int bufferIndex,
                                                   int scaleIndex,
                                                   double* outPartials) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::getPartials\n");
#endif

    gpu->MemcpyDeviceToHost(hPartialsCache, dPartials[bufferIndex], sizeof(Real) * kPartialsSize);

    double* outPartialsOffset = outPartials;
    Real* tmpRealPartialsOffset = hPartialsCache;

    for (int i = 0; i < kPatternCount; i++) {
        beagleMemCpy(outPartialsOffset, tmpRealPartialsOffset, kStateCount);
        tmpRealPartialsOffset += kPaddedStateCount;
        outPartialsOffset += kStateCount;
    }

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::getPartials\n");
#endif

    return BEAGLE_SUCCESS;
}
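// setEigenDecomposition: stages the eigenvectors, inverse eigenvectors, and eigenvalues
// in the padded host matrix cache (transposing as the kernels expect) and uploads them
// to the device buffers for the given eigen index.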
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setEigenDecomposition(int eigenIndex,
                                                             const double* inEigenVectors,
                                                             const double* inInverseEigenVectors,
                                                             const double* inEigenValues) {

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::setEigenDecomposition\n");
#endif

    Real* Ievc, * tmpIevc, * Evec, * tmpEvec, * Eval;

    tmpIevc = Ievc = (Real*) hMatrixCache;
    tmpEvec = Evec = Ievc + kMatrixSize;
    Eval = Evec + kMatrixSize;

    for (int i = 0; i < kStateCount; i++) {
        beagleMemCpy(tmpIevc, inInverseEigenVectors + i * kStateCount, kStateCount);
        beagleMemCpy(tmpEvec, inEigenVectors + i * kStateCount, kStateCount);
        tmpIevc += kPaddedStateCount;
        tmpEvec += kPaddedStateCount;
    }

    if (kFlags & BEAGLE_FLAG_INVEVEC_STANDARD)
        transposeSquareMatrix(Ievc, kPaddedStateCount);
    transposeSquareMatrix(Evec, kPaddedStateCount);

    beagleMemCpy(Eval, inEigenValues, kStateCount);
    if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX) {
        beagleMemCpy(Eval + kPaddedStateCount, inEigenValues + kStateCount, kStateCount);
    }

#ifdef BEAGLE_DEBUG_VALUES
    fprintf(stderr, "Eval =\n");
    printfVector(Eval, kEigenValuesSize);
    fprintf(stderr, "Evec =\n");
    printfVector(Evec, kMatrixSize);
    fprintf(stderr, "Ievc =\n");
    printfVector(Ievc, kPaddedStateCount * kPaddedStateCount);
#endif

    gpu->MemcpyHostToDevice(dIevc[eigenIndex], Ievc, sizeof(Real) * kMatrixSize);
    gpu->MemcpyHostToDevice(dEvec[eigenIndex], Evec, sizeof(Real) * kMatrixSize);
    gpu->MemcpyHostToDevice(dEigenValues[eigenIndex], Eval, sizeof(Real) * kEigenValuesSize);

#ifdef BEAGLE_DEBUG_VALUES
    fprintf(stderr, "dEigenValues =\n");
    gpu->PrintfDeviceVector(dEigenValues[eigenIndex], kEigenValuesSize, r);
    fprintf(stderr, "dEvec =\n");
    gpu->PrintfDeviceVector(dEvec[eigenIndex], kMatrixSize, r);
    fprintf(stderr, "dIevc =\n");
    gpu->PrintfDeviceVector(dIevc[eigenIndex], kPaddedStateCount * kPaddedStateCount, r);
#endif

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::setEigenDecomposition\n");
#endif

    return BEAGLE_SUCCESS;
}
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setStateFrequencies(int stateFrequenciesIndex,
                                                           const double* inStateFrequencies) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::setStateFrequencies\n");
#endif

    if (stateFrequenciesIndex < 0 || stateFrequenciesIndex >= kEigenDecompCount)
        return BEAGLE_ERROR_OUT_OF_RANGE;

    beagleMemCpy(hFrequenciesCache, inStateFrequencies, kStateCount);

    gpu->MemcpyHostToDevice(dFrequencies[stateFrequenciesIndex], hFrequenciesCache,
                            sizeof(Real) * kPaddedStateCount);

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::setStateFrequencies\n");
#endif

    return BEAGLE_SUCCESS;
}
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setCategoryWeights(int categoryWeightsIndex,
                                                          const double* inCategoryWeights) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::setCategoryWeights\n");
#endif

    if (categoryWeightsIndex < 0 || categoryWeightsIndex >= kEigenDecompCount)
        return BEAGLE_ERROR_OUT_OF_RANGE;

    const Real* tmpWeights = beagleCastIfNecessary(inCategoryWeights, hWeightsCache,
                                                   kCategoryCount);

    gpu->MemcpyHostToDevice(dWeights[categoryWeightsIndex], tmpWeights,
                            sizeof(Real) * kCategoryCount);

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::setCategoryWeights\n");
#endif

    return BEAGLE_SUCCESS;
}
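// setCategoryRates: the rates are only cached on the host; they are folded into the
// distance queue when updateTransitionMatrices() is called.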
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setCategoryRates(const double* inCategoryRates) {

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::setCategoryRates\n");
#endif

    const double* categoryRates = inCategoryRates;

    memcpy(hCategoryRates, categoryRates, sizeof(double) * kCategoryCount);

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::setCategoryRates\n");
#endif

    return BEAGLE_SUCCESS;
}
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setPatternWeights(const double* inPatternWeights) {

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::setPatternWeights\n");
#endif

    const Real* tmpWeights = beagleCastIfNecessary(inPatternWeights, hPatternWeightsCache, kPatternCount);

    gpu->MemcpyHostToDevice(dPatternWeights, tmpWeights,
                            sizeof(Real) * kPatternCount);

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::setPatternWeights\n");
#endif

    return BEAGLE_SUCCESS;
}
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::getTransitionMatrix(int matrixIndex,
                                                           double* outMatrix) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::getTransitionMatrix\n");
#endif

    gpu->MemcpyDeviceToHost(hMatrixCache, dMatrices[matrixIndex], sizeof(Real) * kMatrixSize * kCategoryCount);

    double* outMatrixOffset = outMatrix;
    Real* tmpRealMatrixOffset = hMatrixCache;

    for (int l = 0; l < kCategoryCount; l++) {
        transposeSquareMatrix(tmpRealMatrixOffset, kPaddedStateCount);

        for (int i = 0; i < kStateCount; i++) {
            beagleMemCpy(outMatrixOffset, tmpRealMatrixOffset, kStateCount);
            tmpRealMatrixOffset += kPaddedStateCount;
            outMatrixOffset += kStateCount;
        }
        tmpRealMatrixOffset += (kPaddedStateCount - kStateCount) * kPaddedStateCount;
    }

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::getTransitionMatrix\n");
#endif

    return BEAGLE_SUCCESS;
}
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTransitionMatrix(int matrixIndex,
                                                           const double* inMatrix,
                                                           double paddedValue) {

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::setTransitionMatrix\n");
#endif

    const double* inMatrixOffset = inMatrix;
    Real* tmpRealMatrixOffset = hMatrixCache;

    for (int l = 0; l < kCategoryCount; l++) {
        Real* transposeOffset = tmpRealMatrixOffset;

        for (int i = 0; i < kStateCount; i++) {
            beagleMemCpy(tmpRealMatrixOffset, inMatrixOffset, kStateCount);
            tmpRealMatrixOffset += kPaddedStateCount;
            inMatrixOffset += kStateCount;
        }

        transposeSquareMatrix(transposeOffset, kPaddedStateCount);
        tmpRealMatrixOffset += (kPaddedStateCount - kStateCount) * kPaddedStateCount;
    }

    gpu->MemcpyHostToDevice(dMatrices[matrixIndex], hMatrixCache,
                            sizeof(Real) * kMatrixSize * kCategoryCount);

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::setTransitionMatrix\n");
#endif

    return BEAGLE_SUCCESS;
}
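// setTransitionMatrices: stages several caller-supplied matrices at once, lumping runs of
// consecutive matrix indices into a single host-to-device copy (bounded by
// BEAGLE_CACHED_MATRICES_COUNT).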
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTransitionMatrices(const int* matrixIndices,
                                                             const double* inMatrices,
                                                             const double* paddedValues,
                                                             int count) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::setTransitionMatrices\n");
#endif

    int k = 0;
    while (k < count) {
        const double* inMatrixOffset = inMatrices + k*kStateCount*kStateCount*kCategoryCount;
        Real* tmpRealMatrixOffset = hMatrixCache;
        int lumpedMatricesCount = 0;
        int matrixIndex = matrixIndices[k];

        do {
            for (int l = 0; l < kCategoryCount; l++) {
                Real* transposeOffset = tmpRealMatrixOffset;

                for (int i = 0; i < kStateCount; i++) {
                    beagleMemCpy(tmpRealMatrixOffset, inMatrixOffset, kStateCount);
                    tmpRealMatrixOffset += kPaddedStateCount;
                    inMatrixOffset += kStateCount;
                }

                transposeSquareMatrix(transposeOffset, kPaddedStateCount);
                tmpRealMatrixOffset += (kPaddedStateCount - kStateCount) * kPaddedStateCount;
            }

            lumpedMatricesCount++;
            k++;
        } while ((k < count) && (matrixIndices[k] == matrixIndices[k-1] + 1) && (lumpedMatricesCount < BEAGLE_CACHED_MATRICES_COUNT));

        gpu->MemcpyHostToDevice(dMatrices[matrixIndex], hMatrixCache,
                                sizeof(Real) * kMatrixSize * kCategoryCount * lumpedMatricesCount);
    }

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::setTransitionMatrices\n");
#endif

    return BEAGLE_SUCCESS;
}
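// updateTransitionMatrices: builds host-side queues of matrix offsets and
// (branch length x category rate) distances, copies them to the device, and launches a
// batched kernel for the requested derivative level (none, first, or first and second).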
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::updateTransitionMatrices(int eigenIndex,
                                                                const int* probabilityIndices,
                                                                const int* firstDerivativeIndices,
                                                                const int* secondDerivativeIndices,
                                                                const double* edgeLengths,
                                                                int count) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::updateTransitionMatrices\n");
#endif

    int totalCount = 0;

    int indexOffset = kMatrixSize * kCategoryCount;
    int categoryOffset = kMatrixSize;

#ifdef CUDA
    if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL) {
        for (int i = 0; i < count; i++) {
            for (int j = 0; j < kCategoryCount; j++) {
                hPtrQueue[totalCount] = probabilityIndices[i] * indexOffset + j * categoryOffset;
                hDistanceQueue[totalCount] = (Real) (edgeLengths[i] * hCategoryRates[j]);
                totalCount++;
            }
        }

        gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * totalCount);
        gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue, sizeof(Real) * totalCount);

        kernels->GetTransitionProbabilitiesSquare(dMatrices[0], dPtrQueue, dEvec[eigenIndex], dIevc[eigenIndex],
                                                  dEigenValues[eigenIndex], dDistanceQueue, totalCount);
    }
    else if (secondDerivativeIndices == NULL) {

        totalCount = count * kCategoryCount;
        int ptrIndex = 0;
        for (int i = 0; i < count; i++) {
            for (int j = 0; j < kCategoryCount; j++) {
                hPtrQueue[ptrIndex] = probabilityIndices[i] * indexOffset + j * categoryOffset;
                hPtrQueue[ptrIndex + totalCount] = firstDerivativeIndices[i] * indexOffset + j * categoryOffset;
                hDistanceQueue[ptrIndex] = (Real) (edgeLengths[i]);
                hDistanceQueue[ptrIndex + totalCount] = (Real) (hCategoryRates[j]);
                ptrIndex++;
            }
        }

        gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * totalCount * 2);
        gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue, sizeof(Real) * totalCount * 2);

        kernels->GetTransitionProbabilitiesSquareFirstDeriv(dMatrices[0], dPtrQueue, dEvec[eigenIndex], dIevc[eigenIndex],
                                                            dEigenValues[eigenIndex], dDistanceQueue, totalCount);
    } else {
        totalCount = count * kCategoryCount;
        int ptrIndex = 0;
        for (int i = 0; i < count; i++) {
            for (int j = 0; j < kCategoryCount; j++) {
                hPtrQueue[ptrIndex] = probabilityIndices[i] * indexOffset + j * categoryOffset;
                hPtrQueue[ptrIndex + totalCount] = firstDerivativeIndices[i] * indexOffset + j * categoryOffset;
                hPtrQueue[ptrIndex + totalCount*2] = secondDerivativeIndices[i] * indexOffset + j * categoryOffset;
                hDistanceQueue[ptrIndex] = (Real) (edgeLengths[i]);
                hDistanceQueue[ptrIndex + totalCount] = (Real) (hCategoryRates[j]);
                ptrIndex++;
            }
        }

        gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * totalCount * 3);
        gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue, sizeof(Real) * totalCount * 2);

        kernels->GetTransitionProbabilitiesSquareSecondDeriv(dMatrices[0], dPtrQueue, dEvec[eigenIndex], dIevc[eigenIndex],
                                                             dEigenValues[eigenIndex], dDistanceQueue, totalCount);
    }
#else
    for (int i = 0; i < count; i++) {
        for (int j = 0; j < kCategoryCount; j++) {
            hDistanceQueue[totalCount] = (Real) (edgeLengths[i] * hCategoryRates[j]);
            totalCount++;
        }
    }
    gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue, sizeof(Real) * totalCount);

    for (int i = 0; i < count; i++) {
        kernels->GetTransitionProbabilitiesSquare(dMatrices[probabilityIndices[i]], dEvec[eigenIndex], dIevc[eigenIndex],
                                                  dEigenValues[eigenIndex], dDistanceQueue, kCategoryCount,
                                                  i * kCategoryCount);
    }
#endif
#ifdef BEAGLE_DEBUG_VALUES
    for (int i = 0; i < 1; i++) {
        fprintf(stderr, "dMatrices[probabilityIndices[%d]] (hDQ = %1.5e, eL = %1.5e) =\n", i, hDistanceQueue[i], edgeLengths[i]);
        gpu->PrintfDeviceVector(dMatrices[probabilityIndices[i]], kMatrixSize * kCategoryCount, r);
        for (int j = 0; j < kCategoryCount; j++)
            fprintf(stderr, " %1.5f", hCategoryRates[j]);
        fprintf(stderr, "\n");
    }
#endif

#ifdef BEAGLE_DEBUG_SYNCH
    gpu->Synchronize();
#endif

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::updateTransitionMatrices\n");
#endif

    return BEAGLE_SUCCESS;
}
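// updatePartials: for each 7-entry operation (parent, write/read scale indices, two child
// buffers with their transition matrices) picks the matching pruning kernel based on
// whether each child is a compact tip or a partials buffer, applying the configured
// scaling scheme.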
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::updatePartials(const int* operations,
                                                      int operationCount,
                                                      int cumulativeScalingIndex) {

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::updatePartials\n");
#endif

    GPUPtr cumulativeScalingBuffer = 0;
    if (cumulativeScalingIndex != BEAGLE_OP_NONE)
        cumulativeScalingBuffer = dScalingFactors[cumulativeScalingIndex];
    for (int op = 0; op < operationCount; op++) {
        const int parIndex = operations[op * 7];
        const int writeScalingIndex = operations[op * 7 + 1];
        const int readScalingIndex = operations[op * 7 + 2];
        const int child1Index = operations[op * 7 + 3];
        const int child1TransMatIndex = operations[op * 7 + 4];
        const int child2Index = operations[op * 7 + 5];
        const int child2TransMatIndex = operations[op * 7 + 6];

        GPUPtr matrices1 = dMatrices[child1TransMatIndex];
        GPUPtr matrices2 = dMatrices[child2TransMatIndex];

        GPUPtr partials1 = dPartials[child1Index];
        GPUPtr partials2 = dPartials[child2Index];

        GPUPtr partials3 = dPartials[parIndex];

        GPUPtr tipStates1 = dStates[child1Index];
        GPUPtr tipStates2 = dStates[child2Index];

        int rescale = BEAGLE_OP_NONE;
        GPUPtr scalingFactors = (GPUPtr)NULL;

        if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
            int sIndex = parIndex - kTipCount;

            if (tipStates1 == 0 && tipStates2 == 0) {
                rescale = 2;
                scalingFactors = dScalingFactors[sIndex];
            }
        } else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
            rescale = 1;
            scalingFactors = dScalingFactors[parIndex - kTipCount];
        } else if ((kFlags & BEAGLE_FLAG_SCALING_MANUAL) && writeScalingIndex >= 0) {
            rescale = 1;
            scalingFactors = dScalingFactors[writeScalingIndex];
        } else if ((kFlags & BEAGLE_FLAG_SCALING_MANUAL) && readScalingIndex >= 0) {
            rescale = 0;
            scalingFactors = dScalingFactors[readScalingIndex];
        }
#ifdef BEAGLE_DEBUG_VALUES
        fprintf(stderr, "kPaddedPatternCount = %d\n", kPaddedPatternCount);
        fprintf(stderr, "kPatternCount = %d\n", kPatternCount);
        fprintf(stderr, "categoryCount = %d\n", kCategoryCount);
        fprintf(stderr, "partialSize = %d\n", kPartialsSize);
        fprintf(stderr, "writeIndex = %d, readIndex = %d, rescale = %d\n", writeScalingIndex, readScalingIndex, rescale);
        fprintf(stderr, "child1 = \n");
        if (tipStates1)
            gpu->PrintfDeviceInt(tipStates1, kPaddedPatternCount);
        else
            gpu->PrintfDeviceVector(partials1, kPartialsSize, r);
        fprintf(stderr, "child2 = \n");
        if (tipStates2)
            gpu->PrintfDeviceInt(tipStates2, kPaddedPatternCount);
        else
            gpu->PrintfDeviceVector(partials2, kPartialsSize, r);
        fprintf(stderr, "node index = %d\n", parIndex);
#endif
        if (tipStates1 != 0) {
            if (tipStates2 != 0) {
                kernels->StatesStatesPruningDynamicScaling(tipStates1, tipStates2, partials3,
                                                           matrices1, matrices2, scalingFactors,
                                                           cumulativeScalingBuffer,
                                                           kPaddedPatternCount, kCategoryCount,
                                                           rescale);
            } else {
                kernels->StatesPartialsPruningDynamicScaling(tipStates1, partials2, partials3,
                                                             matrices1, matrices2, scalingFactors,
                                                             cumulativeScalingBuffer,
                                                             kPaddedPatternCount, kCategoryCount,
                                                             rescale);
            }
        } else {
            if (tipStates2 != 0) {
                kernels->StatesPartialsPruningDynamicScaling(tipStates2, partials1, partials3,
                                                             matrices2, matrices1, scalingFactors,
                                                             cumulativeScalingBuffer,
                                                             kPaddedPatternCount, kCategoryCount,
                                                             rescale);
            } else {
                if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
                    kernels->PartialsPartialsPruningDynamicCheckScaling(partials1, partials2, partials3,
                                                                        matrices1, matrices2, writeScalingIndex, readScalingIndex,
                                                                        cumulativeScalingIndex, dScalingFactors, dScalingFactorsMaster,
                                                                        kPaddedPatternCount, kCategoryCount,
                                                                        rescale, hRescalingTrigger, dRescalingTrigger, sizeof(Real));
                } else {
                    kernels->PartialsPartialsPruningDynamicScaling(partials1, partials2, partials3,
                                                                   matrices1, matrices2, scalingFactors,
                                                                   cumulativeScalingBuffer,
                                                                   kPaddedPatternCount, kCategoryCount,
                                                                   rescale);
                }
            }
        }
        if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
            int parScalingIndex = parIndex - kTipCount;
            int child1ScalingIndex = child1Index - kTipCount;
            int child2ScalingIndex = child2Index - kTipCount;
            if (child1ScalingIndex >= 0 && child2ScalingIndex >= 0) {
                int scalingIndices[2] = {child1ScalingIndex, child2ScalingIndex};
                accumulateScaleFactors(scalingIndices, 2, parScalingIndex);
            } else if (child1ScalingIndex >= 0) {
                int scalingIndices[1] = {child1ScalingIndex};
                accumulateScaleFactors(scalingIndices, 1, parScalingIndex);
            } else if (child2ScalingIndex >= 0) {
                int scalingIndices[1] = {child2ScalingIndex};
                accumulateScaleFactors(scalingIndices, 1, parScalingIndex);
            }
        }
#ifdef BEAGLE_DEBUG_VALUES
        fprintf(stderr, "scalars = ");
        gpu->PrintfDeviceVector(scalingFactors, kPaddedPatternCount, r);

        fprintf(stderr, "parent = \n");
        if (writeScalingIndex == -1)
            gpu->PrintfDeviceVector(partials3, kPartialsSize, r);
        else
            gpu->PrintfDeviceVector(partials3, kPartialsSize, 1.0, &signal, r);
#endif
    }

#ifdef BEAGLE_DEBUG_SYNCH
    gpu->Synchronize();
#endif

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::updatePartials\n");
#endif

    return BEAGLE_SUCCESS;
}
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::waitForPartials(const int* /*destinationPartials*/,
                                                       int /*destinationPartialsCount*/) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::waitForPartials\n");
#endif

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::waitForPartials\n");
#endif

    return BEAGLE_SUCCESS;
}
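// accumulateScaleFactors: folds the selected per-node scale factors into a cumulative
// scale buffer on the device; AUTO scaling gathers factors through an index queue, and
// DYNAMIC scaling copies the cumulative buffer back to its master copy first.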
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::accumulateScaleFactors(const int* scalingIndices,
                                                              int count,
                                                              int cumulativeScalingIndex) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::accumulateScaleFactors\n");
#endif

    if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
        if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
            gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex], sizeof(Real) * kScaleBufferSize);
            dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
        }
    }

    if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
        for (int n = 0; n < count; n++) {
            int sIndex = scalingIndices[n] - kTipCount;
            hPtrQueue[n] = sIndex;
        }
        gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);

        kernels->AccumulateFactorsAutoScaling(dScalingFactors[0], dPtrQueue, dAccumulatedScalingFactors, count, kPaddedPatternCount, kScaleBufferSize);
    } else {
        for (int n = 0; n < count; n++)
            hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
        gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);

#ifdef CUDA
        kernels->AccumulateFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex], count, kPaddedPatternCount);
#else
        for (int i = 0; i < count; i++) {
            kernels->AccumulateFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
                                                     1, kPaddedPatternCount);
        }
#endif
    }

#ifdef BEAGLE_DEBUG_SYNCH
    gpu->Synchronize();
#endif

#ifdef BEAGLE_DEBUG_VALUES
    fprintf(stderr, "scaling factors = ");
    gpu->PrintfDeviceVector(dScalingFactors[cumulativeScalingIndex], kPaddedPatternCount, r);
#endif

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::accumulateScaleFactors\n");
#endif

    return BEAGLE_SUCCESS;
}
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::removeScaleFactors(const int* scalingIndices,
                                                          int count,
                                                          int cumulativeScalingIndex) {

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::removeScaleFactors\n");
#endif

    if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
        if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
            gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex], sizeof(Real) * kScaleBufferSize);
            dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
        }
    }

    for (int n = 0; n < count; n++)
        hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
    gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);

#ifdef CUDA
    kernels->RemoveFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex],
                                         count, kPaddedPatternCount);
#else
    for (int i = 0; i < count; i++) {
        kernels->RemoveFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
                                             1, kPaddedPatternCount);
    }
#endif

#ifdef BEAGLE_DEBUG_SYNCH
    gpu->Synchronize();
#endif

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::removeScaleFactors\n");
#endif

    return BEAGLE_SUCCESS;
}
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::resetScaleFactors(int cumulativeScalingIndex) {

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::resetScaleFactors\n");
#endif

    if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
        if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex])
            dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];

        if (dScalingFactors[cumulativeScalingIndex] == 0) {
            dScalingFactors[cumulativeScalingIndex] = gpu->AllocateMemory(kScaleBufferSize * sizeof(Real));
            dScalingFactorsMaster[cumulativeScalingIndex] = dScalingFactors[cumulativeScalingIndex];
        }
    }

    Real* zeroes = (Real*) gpu->CallocHost(sizeof(Real), kPaddedPatternCount);

    gpu->MemcpyHostToDevice(dScalingFactors[cumulativeScalingIndex], zeroes,
                            sizeof(Real) * kPaddedPatternCount);

    gpu->FreeHostMemory(zeroes);

#ifdef BEAGLE_DEBUG_SYNCH
    gpu->Synchronize();
#endif

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::resetScaleFactors\n");
#endif

    return BEAGLE_SUCCESS;
}
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::copyScaleFactors(int destScalingIndex,
                                                        int srcScalingIndex) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::copyScaleFactors\n");
#endif

    if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
        dScalingFactors[destScalingIndex] = dScalingFactors[srcScalingIndex];
    } else {
        gpu->MemcpyDeviceToDevice(dScalingFactors[destScalingIndex], dScalingFactors[srcScalingIndex], sizeof(Real) * kScaleBufferSize);
    }

#ifdef BEAGLE_DEBUG_SYNCH
    gpu->Synchronize();
#endif

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::copyScaleFactors\n");
#endif

    return BEAGLE_SUCCESS;
}
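// calculateRootLogLikelihoods: integrates the root partials over rate categories and state
// frequencies (applying any cumulative scale factors), then reduces the per-pattern values
// against the pattern weights; the count > 1 path repeats this per subset of buffers.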
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::calculateRootLogLikelihoods(const int* bufferIndices,
                                                                   const int* categoryWeightsIndices,
                                                                   const int* stateFrequenciesIndices,
                                                                   const int* cumulativeScaleIndices,
                                                                   int count,
                                                                   double* outSumLogLikelihood) {

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::calculateRootLogLikelihoods\n");
#endif

    int returnCode = BEAGLE_SUCCESS;
    if (count == 1) {
        const int rootNodeIndex = bufferIndices[0];
        const int categoryWeightsIndex = categoryWeightsIndices[0];
        const int stateFrequenciesIndex = stateFrequenciesIndices[0];

        GPUPtr dCumulativeScalingFactor;
        bool scale = true;
        if (kFlags & BEAGLE_FLAG_SCALING_AUTO)
            dCumulativeScalingFactor = dAccumulatedScalingFactors;
        else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)
            dCumulativeScalingFactor = dScalingFactors[bufferIndices[0] - kTipCount];
        else if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE)
            dCumulativeScalingFactor = dScalingFactors[cumulativeScaleIndices[0]];
        else
            scale = false;

#ifdef BEAGLE_DEBUG_VALUES
        fprintf(stderr, "root partials = \n");
        gpu->PrintfDeviceVector(dPartials[rootNodeIndex], kPaddedPatternCount, r);
#endif

        if (scale) {
            kernels->IntegrateLikelihoodsDynamicScaling(dIntegrationTmp, dPartials[rootNodeIndex],
                                                        dWeights[categoryWeightsIndex],
                                                        dFrequencies[stateFrequenciesIndex],
                                                        dCumulativeScalingFactor,
                                                        kPaddedPatternCount,
                                                        kCategoryCount);
        } else {
            kernels->IntegrateLikelihoods(dIntegrationTmp, dPartials[rootNodeIndex],
                                          dWeights[categoryWeightsIndex],
                                          dFrequencies[stateFrequenciesIndex],
                                          kPaddedPatternCount, kCategoryCount);
        }
#ifdef BEAGLE_DEBUG_VALUES
        fprintf(stderr, "before SumSites1 = \n");
        gpu->PrintfDeviceVector(dIntegrationTmp, kPaddedPatternCount, r);
#endif

        kernels->SumSites1(dIntegrationTmp, dSumLogLikelihood, dPatternWeights,
                           kPatternCount);

        gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount);

        *outSumLogLikelihood = 0.0;
        for (int i = 0; i < kSumSitesBlockCount; i++) {
            if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0))
                returnCode = BEAGLE_ERROR_FLOATING_POINT;

            *outSumLogLikelihood += hLogLikelihoodsCache[i];
        }
    } else { // count > 1
        if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
            for (int n = 0; n < count; n++) {
                int cumulativeScalingFactor = bufferIndices[n] - kTipCount;
                hPtrQueue[n] = cumulativeScalingFactor * kScaleBufferSize;
            }
            gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
        } else if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE) {
            for (int n = 0; n < count; n++)
                hPtrQueue[n] = cumulativeScaleIndices[n] * kScaleBufferSize;
            gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
        }

        for (int subsetIndex = 0; subsetIndex < count; ++subsetIndex) {

            const GPUPtr tmpDWeights = dWeights[categoryWeightsIndices[subsetIndex]];
            const GPUPtr tmpDFrequencies = dFrequencies[stateFrequenciesIndices[subsetIndex]];
            const int rootNodeIndex = bufferIndices[subsetIndex];

            if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) {
                kernels->IntegrateLikelihoodsFixedScaleMulti(dIntegrationTmp, dPartials[rootNodeIndex], tmpDWeights,
                                                             tmpDFrequencies, dScalingFactors[0], dPtrQueue, dMaxScalingFactors,
                                                             dIndexMaxScalingFactors,
                                                             kPaddedPatternCount,
                                                             kCategoryCount, count, subsetIndex);
            } else {
                if (subsetIndex == 0) {
                    kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartials[rootNodeIndex], tmpDWeights,
                                                       tmpDFrequencies,
                                                       kPaddedPatternCount, kCategoryCount, 0);
                } else if (subsetIndex == count - 1) {
                    kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartials[rootNodeIndex], tmpDWeights,
                                                       tmpDFrequencies,
                                                       kPaddedPatternCount, kCategoryCount, 1);
                } else {
                    kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartials[rootNodeIndex], tmpDWeights,
                                                       tmpDFrequencies,
                                                       kPaddedPatternCount, kCategoryCount, 2);
                }
            }

            kernels->SumSites1(dIntegrationTmp, dSumLogLikelihood, dPatternWeights,
                               kPatternCount);

            gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount);

            *outSumLogLikelihood = 0.0;
            for (int i = 0; i < kSumSitesBlockCount; i++) {
                if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0))
                    returnCode = BEAGLE_ERROR_FLOATING_POINT;

                *outSumLogLikelihood += hLogLikelihoodsCache[i];
            }
        }
    }
#ifdef BEAGLE_DEBUG_VALUES
    fprintf(stderr, "parent = \n");
    gpu->PrintfDeviceVector(dIntegrationTmp, kPatternCount, r);
#endif

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::calculateRootLogLikelihoods\n");
#endif

    return returnCode;
}
BEAGLE_GPU_TEMPLATE
int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::calculateEdgeLogLikelihoods(const int* parentBufferIndices,
                                                                   const int* childBufferIndices,
                                                                   const int* probabilityIndices,
                                                                   const int* firstDerivativeIndices,
                                                                   const int* secondDerivativeIndices,
                                                                   const int* categoryWeightsIndices,
                                                                   const int* stateFrequenciesIndices,
                                                                   const int* cumulativeScaleIndices,
                                                                   int count,
                                                                   double* outSumLogLikelihood,
                                                                   double* outSumFirstDerivative,
                                                                   double* outSumSecondDerivative) {

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::calculateEdgeLogLikelihoods\n");
#endif

    int returnCode = BEAGLE_SUCCESS;
    if (count == 1) {

        const int parIndex = parentBufferIndices[0];
        const int childIndex = childBufferIndices[0];
        const int probIndex = probabilityIndices[0];

        const int categoryWeightsIndex = categoryWeightsIndices[0];
        const int stateFrequenciesIndex = stateFrequenciesIndices[0];

        GPUPtr partialsParent = dPartials[parIndex];
        GPUPtr partialsChild = dPartials[childIndex];
        GPUPtr statesChild = dStates[childIndex];
        GPUPtr transMatrix = dMatrices[probIndex];

        GPUPtr dCumulativeScalingFactor;
        bool scale = true;

        if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
            dCumulativeScalingFactor = dAccumulatedScalingFactors;
        } else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
            int cumulativeScalingFactor = kInternalPartialsBufferCount;
            int child1ScalingIndex = parIndex - kTipCount;
            int child2ScalingIndex = childIndex - kTipCount;
            resetScaleFactors(cumulativeScalingFactor);
            if (child1ScalingIndex >= 0 && child2ScalingIndex >= 0) {
                int scalingIndices[2] = {child1ScalingIndex, child2ScalingIndex};
                accumulateScaleFactors(scalingIndices, 2, cumulativeScalingFactor);
            } else if (child1ScalingIndex >= 0) {
                int scalingIndices[1] = {child1ScalingIndex};
                accumulateScaleFactors(scalingIndices, 1, cumulativeScalingFactor);
            } else if (child2ScalingIndex >= 0) {
                int scalingIndices[1] = {child2ScalingIndex};
                accumulateScaleFactors(scalingIndices, 1, cumulativeScalingFactor);
            }
            dCumulativeScalingFactor = dScalingFactors[cumulativeScalingFactor];
        } else if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE) {
            dCumulativeScalingFactor = dScalingFactors[cumulativeScaleIndices[0]];
        } else {
            scale = false;
        }
        if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL) {
            if (statesChild != 0) {
                kernels->StatesPartialsEdgeLikelihoods(dPartialsTmp, partialsParent, statesChild,
                                                       transMatrix, kPaddedPatternCount,
                                                       kCategoryCount);
            } else {
                kernels->PartialsPartialsEdgeLikelihoods(dPartialsTmp, partialsParent, partialsChild,
                                                         transMatrix, kPaddedPatternCount,
                                                         kCategoryCount);
            }

            if (scale) {
                kernels->IntegrateLikelihoodsDynamicScaling(dIntegrationTmp, dPartialsTmp, dWeights[categoryWeightsIndex],
                                                            dFrequencies[stateFrequenciesIndex],
                                                            dCumulativeScalingFactor,
                                                            kPaddedPatternCount, kCategoryCount);
            } else {
                kernels->IntegrateLikelihoods(dIntegrationTmp, dPartialsTmp, dWeights[categoryWeightsIndex],
                                              dFrequencies[stateFrequenciesIndex],
                                              kPaddedPatternCount, kCategoryCount);
            }

            kernels->SumSites1(dIntegrationTmp, dSumLogLikelihood, dPatternWeights,
                               kPatternCount);

            gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount);

            *outSumLogLikelihood = 0.0;
            for (int i = 0; i < kSumSitesBlockCount; i++) {
                if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0))
                    returnCode = BEAGLE_ERROR_FLOATING_POINT;

                *outSumLogLikelihood += hLogLikelihoodsCache[i];
            }
        } else if (secondDerivativeIndices == NULL) {
            const int firstDerivIndex = firstDerivativeIndices[0];
            GPUPtr firstDerivMatrix = dMatrices[firstDerivIndex];
            GPUPtr secondDerivMatrix = dMatrices[firstDerivIndex];

            if (statesChild != 0) {
                kernels->StatesPartialsEdgeLikelihoodsSecondDeriv(dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
                                                                  partialsParent, statesChild,
                                                                  transMatrix, firstDerivMatrix, secondDerivMatrix,
                                                                  kPaddedPatternCount, kCategoryCount);
            } else {
                kernels->PartialsPartialsEdgeLikelihoodsSecondDeriv(dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
                                                                    partialsParent, partialsChild,
                                                                    transMatrix, firstDerivMatrix, secondDerivMatrix,
                                                                    kPaddedPatternCount, kCategoryCount);
            }

            if (scale) {
                kernels->IntegrateLikelihoodsDynamicScalingSecondDeriv(dIntegrationTmp, dOutFirstDeriv, dOutSecondDeriv,
                                                                       dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
                                                                       dWeights[categoryWeightsIndex],
                                                                       dFrequencies[stateFrequenciesIndex],
                                                                       dCumulativeScalingFactor,
                                                                       kPaddedPatternCount, kCategoryCount);
            } else {
                kernels->IntegrateLikelihoodsSecondDeriv(dIntegrationTmp, dOutFirstDeriv, dOutSecondDeriv,
                                                         dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
                                                         dWeights[categoryWeightsIndex],
                                                         dFrequencies[stateFrequenciesIndex],
                                                         kPaddedPatternCount, kCategoryCount);
            }

            kernels->SumSites2(dIntegrationTmp, dSumLogLikelihood, dOutFirstDeriv, dSumFirstDeriv, dPatternWeights,
                               kPatternCount);

            gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount);

            *outSumLogLikelihood = 0.0;
            for (int i = 0; i < kSumSitesBlockCount; i++) {
                if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0))
                    returnCode = BEAGLE_ERROR_FLOATING_POINT;

                *outSumLogLikelihood += hLogLikelihoodsCache[i];
            }

            gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumFirstDeriv, sizeof(Real) * kSumSitesBlockCount);

            *outSumFirstDerivative = 0.0;
            for (int i = 0; i < kSumSitesBlockCount; i++) {
                *outSumFirstDerivative += hLogLikelihoodsCache[i];
            }
        } else {
            const int firstDerivIndex = firstDerivativeIndices[0];
            const int secondDerivIndex = secondDerivativeIndices[0];
            GPUPtr firstDerivMatrix = dMatrices[firstDerivIndex];
            GPUPtr secondDerivMatrix = dMatrices[secondDerivIndex];

            if (statesChild != 0) {
                kernels->StatesPartialsEdgeLikelihoodsSecondDeriv(dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
                                                                  partialsParent, statesChild,
                                                                  transMatrix, firstDerivMatrix, secondDerivMatrix,
                                                                  kPaddedPatternCount, kCategoryCount);
            } else {
                kernels->PartialsPartialsEdgeLikelihoodsSecondDeriv(dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
                                                                    partialsParent, partialsChild,
                                                                    transMatrix, firstDerivMatrix, secondDerivMatrix,
                                                                    kPaddedPatternCount, kCategoryCount);
            }

            if (scale) {
                kernels->IntegrateLikelihoodsDynamicScalingSecondDeriv(dIntegrationTmp, dOutFirstDeriv, dOutSecondDeriv,
                                                                       dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
                                                                       dWeights[categoryWeightsIndex],
                                                                       dFrequencies[stateFrequenciesIndex],
                                                                       dCumulativeScalingFactor,
                                                                       kPaddedPatternCount, kCategoryCount);
            } else {
                kernels->IntegrateLikelihoodsSecondDeriv(dIntegrationTmp, dOutFirstDeriv, dOutSecondDeriv,
                                                         dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
                                                         dWeights[categoryWeightsIndex],
                                                         dFrequencies[stateFrequenciesIndex],
                                                         kPaddedPatternCount, kCategoryCount);
            }

            kernels->SumSites3(dIntegrationTmp, dSumLogLikelihood, dOutFirstDeriv, dSumFirstDeriv, dOutSecondDeriv, dSumSecondDeriv, dPatternWeights,
                               kPatternCount);

            gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount);

            *outSumLogLikelihood = 0.0;
            for (int i = 0; i < kSumSitesBlockCount; i++) {
                if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0))
                    returnCode = BEAGLE_ERROR_FLOATING_POINT;

                *outSumLogLikelihood += hLogLikelihoodsCache[i];
            }

            gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumFirstDeriv, sizeof(Real) * kSumSitesBlockCount);

            *outSumFirstDerivative = 0.0;
            for (int i = 0; i < kSumSitesBlockCount; i++) {
                *outSumFirstDerivative += hLogLikelihoodsCache[i];
            }

            gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumSecondDeriv, sizeof(Real) * kSumSitesBlockCount);

            *outSumSecondDerivative = 0.0;
            for (int i = 0; i < kSumSitesBlockCount; i++) {
                *outSumSecondDerivative += hLogLikelihoodsCache[i];
            }
        }
    } else { // count > 1
        if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL) {

            if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
                fprintf(stderr, "BeagleGPUImpl::calculateEdgeLogLikelihoods not yet implemented for count > 1 and SCALING_ALWAYS\n");
            } else if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE) {
                for (int n = 0; n < count; n++)
                    hPtrQueue[n] = cumulativeScaleIndices[n] * kScaleBufferSize;
                gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
            }

            for (int subsetIndex = 0; subsetIndex < count; ++subsetIndex) {

                const int parIndex = parentBufferIndices[subsetIndex];
                const int childIndex = childBufferIndices[subsetIndex];
                const int probIndex = probabilityIndices[subsetIndex];

                GPUPtr partialsParent = dPartials[parIndex];
                GPUPtr partialsChild = dPartials[childIndex];
                GPUPtr statesChild = dStates[childIndex];
                GPUPtr transMatrix = dMatrices[probIndex];

                const GPUPtr tmpDWeights = dWeights[categoryWeightsIndices[subsetIndex]];
                const GPUPtr tmpDFrequencies = dFrequencies[stateFrequenciesIndices[subsetIndex]];

                if (statesChild != 0) {
                    kernels->StatesPartialsEdgeLikelihoods(dPartialsTmp, partialsParent, statesChild,
                                                           transMatrix, kPaddedPatternCount,
                                                           kCategoryCount);
                } else {
                    kernels->PartialsPartialsEdgeLikelihoods(dPartialsTmp, partialsParent, partialsChild,
                                                             transMatrix, kPaddedPatternCount,
                                                             kCategoryCount);
                }

                if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE) {
                    kernels->IntegrateLikelihoodsFixedScaleMulti(dIntegrationTmp, dPartialsTmp, tmpDWeights,
                                                                 tmpDFrequencies, dScalingFactors[0], dPtrQueue, dMaxScalingFactors,
                                                                 dIndexMaxScalingFactors,
                                                                 kPaddedPatternCount,
                                                                 kCategoryCount, count, subsetIndex);
                } else {
                    if (subsetIndex == 0) {
                        kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartialsTmp, tmpDWeights,
                                                           tmpDFrequencies,
                                                           kPaddedPatternCount, kCategoryCount, 0);
                    } else if (subsetIndex == count - 1) {
                        kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartialsTmp, tmpDWeights,
                                                           tmpDFrequencies,
                                                           kPaddedPatternCount, kCategoryCount, 1);
                    } else {
                        kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartialsTmp, tmpDWeights,
                                                           tmpDFrequencies,
                                                           kPaddedPatternCount, kCategoryCount, 2);
                    }
                }

                kernels->SumSites1(dIntegrationTmp, dSumLogLikelihood, dPatternWeights,
                                   kPatternCount);

                gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount);

                *outSumLogLikelihood = 0.0;
                for (int i = 0; i < kSumSitesBlockCount; i++) {
                    if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0))
                        returnCode = BEAGLE_ERROR_FLOATING_POINT;

                    *outSumLogLikelihood += hLogLikelihoodsCache[i];
                }
            }
        } else {
            fprintf(stderr, "BeagleGPUImpl::calculateEdgeLogLikelihoods not yet implemented for count > 1 and derivatives\n");
            returnCode = BEAGLE_ERROR_GENERAL;
        }
    }

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::calculateEdgeLogLikelihoods\n");
#endif

    return returnCode;
}
template<>
int BeagleGPUImpl<float>::getSiteLogLikelihoods(double* outLogLikelihoods) {

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::getSiteLogLikelihoods\n");
#endif

    gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dIntegrationTmp, sizeof(float) * kPatternCount);
    beagleMemCpy(outLogLikelihoods, hLogLikelihoodsCache, kPatternCount);

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::getSiteLogLikelihoods\n");
#endif

    return BEAGLE_SUCCESS;
}
template<>
int BeagleGPUImpl<double>::getSiteLogLikelihoods(double* outLogLikelihoods) {

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::getSiteLogLikelihoods\n");
#endif

    gpu->MemcpyDeviceToHost(outLogLikelihoods, dIntegrationTmp, sizeof(double) * kPatternCount);

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::getSiteLogLikelihoods\n");
#endif

    return BEAGLE_SUCCESS;
}
template<>
int BeagleGPUImpl<double>::getSiteDerivatives(double* outFirstDerivatives,
                                              double* outSecondDerivatives) {

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::getSiteDerivatives\n");
#endif

    gpu->MemcpyDeviceToHost(outFirstDerivatives, dOutFirstDeriv, sizeof(double) * kPatternCount);

    if (outSecondDerivatives != NULL) {
        gpu->MemcpyDeviceToHost(outSecondDerivatives, dOutSecondDeriv, sizeof(double) * kPatternCount);
    }

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::getSiteDerivatives\n");
#endif

    return BEAGLE_SUCCESS;
}
template<>
int BeagleGPUImpl<float>::getSiteDerivatives(double* outFirstDerivatives,
                                             double* outSecondDerivatives) {

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tEntering BeagleGPUImpl::getSiteDerivatives\n");
#endif

    gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dOutFirstDeriv, sizeof(float) * kPatternCount);
    beagleMemCpy(outFirstDerivatives, hLogLikelihoodsCache, kPatternCount);

    if (outSecondDerivatives != NULL) {
        gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dOutSecondDeriv, sizeof(float) * kPatternCount);
        beagleMemCpy(outSecondDerivatives, hLogLikelihoodsCache, kPatternCount);
    }

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\tLeaving BeagleGPUImpl::getSiteDerivatives\n");
#endif

    return BEAGLE_SUCCESS;
}
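// Factory: constructs a BeagleGPUImpl, forwards the configuration to createInstance(), and
// only returns the implementation when initialization reports BEAGLE_SUCCESS.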
BEAGLE_GPU_TEMPLATE
BeagleImpl* BeagleGPUImplFactory<BEAGLE_GPU_GENERIC>::createImpl(int tipCount,
                                                                 int partialsBufferCount,
                                                                 int compactBufferCount,
                                                                 int stateCount,
                                                                 int patternCount,
                                                                 int eigenBufferCount,
                                                                 int matrixBufferCount,
                                                                 int categoryCount,
                                                                 int scaleBufferCount,
                                                                 int resourceNumber,
                                                                 long preferenceFlags,
                                                                 long requirementFlags,
                                                                 int* errorCode) {
    BeagleImpl* impl = new BeagleGPUImpl<BEAGLE_GPU_GENERIC>();
    try {
        *errorCode =
            impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount,
                                 patternCount, eigenBufferCount, matrixBufferCount,
                                 categoryCount, scaleBufferCount, resourceNumber, preferenceFlags, requirementFlags);
        if (*errorCode == BEAGLE_SUCCESS) {
            return impl;
        }
    } catch(...) {
        delete impl;
        *errorCode = BEAGLE_ERROR_GENERAL;
        throw;
    }

    delete impl;
    *errorCode = BEAGLE_ERROR_GENERAL;
    return NULL;
}
const char* BeagleGPUImplFactory<double>::getName() {

const char* BeagleGPUImplFactory<float>::getName() {
void modifyFlagsForPrecision(long *flags, double r) {
    *flags |= BEAGLE_FLAG_PRECISION_DOUBLE;
}

void modifyFlagsForPrecision(long *flags, float r) {
    *flags |= BEAGLE_FLAG_PRECISION_SINGLE;
}
BEAGLE_GPU_TEMPLATE
const long BeagleGPUImplFactory<BEAGLE_GPU_GENERIC>::getFlags() {
    long flags = BEAGLE_FLAG_COMPUTATION_SYNCH |
                 BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO | BEAGLE_FLAG_SCALING_DYNAMIC |
                 BEAGLE_FLAG_THREADING_NONE |
                 BEAGLE_FLAG_VECTOR_NONE |
                 BEAGLE_FLAG_PROCESSOR_GPU |
                 BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW |
                 BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL |
                 BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED;

    Real r = 0;
    modifyFlagsForPrecision(&flags, r);
    return flags;
}