68 #ifndef BEAGLE_CPU_IMPL_GENERAL_HPP
69 #define BEAGLE_CPU_IMPL_GENERAL_HPP
72 #include "libhmsbeagle/config.h"
84 #include "libhmsbeagle/beagle.h"
85 #include "libhmsbeagle/CPU/Precision.h"
86 #include "libhmsbeagle/CPU/BeagleCPUImpl.h"
87 #include "libhmsbeagle/CPU/EigenDecompositionCube.h"
88 #include "libhmsbeagle/CPU/EigenDecompositionSquare.h"
99 BEAGLE_CPU_FACTORY_TEMPLATE
100 inline const char* getBeagleCPUName(){
return "CPU-Unknown"; };
103 inline const char* getBeagleCPUName<double>(){
return "CPU-Double"; };
106 inline const char* getBeagleCPUName<float>(){
return "CPU-Single"; };
108 BEAGLE_CPU_FACTORY_TEMPLATE
109 inline const long getBeagleCPUFlags(){
return BEAGLE_FLAG_COMPUTATION_SYNCH; };
112 inline const long getBeagleCPUFlags<double>(){
return BEAGLE_FLAG_COMPUTATION_SYNCH |
113 BEAGLE_FLAG_THREADING_NONE |
114 BEAGLE_FLAG_PROCESSOR_CPU |
115 BEAGLE_FLAG_PRECISION_DOUBLE |
116 BEAGLE_FLAG_VECTOR_NONE; };
119 inline const long getBeagleCPUFlags<float>(){
return BEAGLE_FLAG_COMPUTATION_SYNCH |
120 BEAGLE_FLAG_THREADING_NONE |
121 BEAGLE_FLAG_PROCESSOR_CPU |
122 BEAGLE_FLAG_PRECISION_SINGLE |
123 BEAGLE_FLAG_VECTOR_NONE; };
// Destructor: releases every per-instance heap buffer allocated in
// createInstance() (malloc/calloc/mallocAligned pairs with free here),
// then deletes the owned eigen-decomposition object.
// NOTE(review): this extraction appears to have dropped lines -- e.g.
// the free() calls under the gPartials[i]/gTipStates[i] NULL checks and
// several closing braces -- verify against the upstream source.
128 BeagleCPUImpl<BEAGLE_CPU_GENERIC>::~BeagleCPUImpl() {
// Per-eigen-decomposition weight and frequency buffers.
133 for(
unsigned int i=0; i<kEigenDecompCount; i++) {
134 if (gCategoryWeights[i] != NULL)
135 free(gCategoryWeights[i]);
136 if (gStateFrequencies[i] != NULL)
137 free(gStateFrequencies[i]);
// Transition-probability matrices, then the pointer array itself.
140 for(
unsigned int i=0; i<kMatrixCount; i++) {
141 if (gTransitionMatrices[i] != NULL)
142 free(gTransitionMatrices[i]);
144 free(gTransitionMatrices);
// Partials and compact tip-state buffers (one slot per buffer index).
146 for(
unsigned int i=0; i<kBufferCount; i++) {
147 if (gPartials[i] != NULL)
149 if (gTipStates[i] != NULL)
// Auto-scaling keeps short-integer exponent buffers plus one real
// cumulative buffer at gScaleBuffers[0]; otherwise each scale buffer
// is freed individually.
155 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
156 for(
unsigned int i=0; i<kScaleBufferCount; i++) {
157 if (gAutoScaleBuffers[i] != NULL)
158 free(gAutoScaleBuffers[i]);
160 if (gAutoScaleBuffers)
161 free(gAutoScaleBuffers);
162 free(gActiveScalingFactors);
163 if (gScaleBuffers[0] != NULL)
164 free(gScaleBuffers[0]);
166 for(
unsigned int i=0; i<kScaleBufferCount; i++) {
167 if (gScaleBuffers[i] != NULL)
168 free(gScaleBuffers[i]);
// Scratch buffers used by the likelihood routines.
175 free(gCategoryRates);
176 free(gPatternWeights);
178 free(integrationTmp);
180 free(secondDerivTmp);
182 free(outLogLikelihoodsTmp);
183 free(outFirstDerivativesTmp);
184 free(outSecondDerivativesTmp);
// Owned polymorphic eigen-decomposition helper (allocated with new).
189 delete gEigenDecomposition;
// createInstance: one-time initialization of an implementation
// instance.  Records the problem dimensions, resolves the scaling and
// eigen-decomposition mode from preference/requirement flags, and
// allocates every buffer the instance owns.  Returns BEAGLE_SUCCESS;
// throws std::bad_alloc on allocation failure.
// NOTE(review): several parameter lines (e.g. stateCount, patternCount,
// matrixCount, categoryCount) and some else/closing-brace lines appear
// to have been dropped by extraction -- verify against upstream.
193 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::createInstance(
int tipCount,
194 int partialsBufferCount,
195 int compactBufferCount,
198 int eigenDecompositionCount,
201 int scaleBufferCount,
203 long preferenceFlags,
204 long requirementFlags) {
205 if (DEBUGGING_OUTPUT)
206 std::cerr <<
"in BeagleCPUImpl::initialize\n" ;
// Precision-dependent underflow threshold and rescaling trigger.
208 if (DOUBLE_PRECISION) {
209 realtypeMin = DBL_MIN;
210 scalingExponentThreshhold = 200;
212 realtypeMin = FLT_MIN;
213 scalingExponentThreshhold = 20;
216 kBufferCount = partialsBufferCount + compactBufferCount;
217 kTipCount = tipCount;
218 assert(kBufferCount > kTipCount);
219 kStateCount = stateCount;
220 kPatternCount = patternCount;
222 kInternalPartialsBufferCount = kBufferCount - kTipCount;
// T_PAD / P_PAD pad the state dimension for matrix resp. partials
// storage (alignment/vectorization padding).
224 kTransPaddedStateCount = kStateCount + T_PAD;
225 kPartialsPaddedStateCount = kStateCount + P_PAD;
// Round the pattern count up to the implementation's modulus.
228 int modulus = getPaddedPatternsModulus();
229 kPaddedPatternCount = kPatternCount;
230 int remainder = kPatternCount % modulus;
231 if (remainder != 0) {
232 kPaddedPatternCount += modulus - remainder;
234 kExtraPatterns = kPaddedPatternCount - kPatternCount;
236 kMatrixCount = matrixCount;
237 kEigenDecompCount = eigenDecompositionCount;
238 kCategoryCount = categoryCount;
239 kScaleBufferCount = scaleBufferCount;
241 kMatrixSize = (T_PAD + kStateCount) * kStateCount;
243 int scaleBufferSize = kPaddedPatternCount;
// Resolve the scaling mode (auto > always > dynamic > manual) and the
// scaler representation (log vs raw) from the caller's flags.  AUTO and
// ALWAYS override the requested scale-buffer count.
247 if (preferenceFlags & BEAGLE_FLAG_SCALING_AUTO || requirementFlags & BEAGLE_FLAG_SCALING_AUTO) {
248 kFlags |= BEAGLE_FLAG_SCALING_AUTO;
249 kFlags |= BEAGLE_FLAG_SCALERS_LOG;
250 kScaleBufferCount = kInternalPartialsBufferCount;
251 }
else if (preferenceFlags & BEAGLE_FLAG_SCALING_ALWAYS || requirementFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
252 kFlags |= BEAGLE_FLAG_SCALING_ALWAYS;
253 kFlags |= BEAGLE_FLAG_SCALERS_LOG;
// +1: extra slot for an edge-likelihood cumulative buffer.
254 kScaleBufferCount = kInternalPartialsBufferCount + 1;
255 }
else if (preferenceFlags & BEAGLE_FLAG_SCALING_DYNAMIC || requirementFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
256 kFlags |= BEAGLE_FLAG_SCALING_DYNAMIC;
257 kFlags |= BEAGLE_FLAG_SCALERS_RAW;
258 }
else if (preferenceFlags & BEAGLE_FLAG_SCALERS_LOG || requirementFlags & BEAGLE_FLAG_SCALERS_LOG) {
259 kFlags |= BEAGLE_FLAG_SCALING_MANUAL;
260 kFlags |= BEAGLE_FLAG_SCALERS_LOG;
262 kFlags |= BEAGLE_FLAG_SCALING_MANUAL;
263 kFlags |= BEAGLE_FLAG_SCALERS_RAW;
// Complex eigenvalues require the general (square) decomposition;
// real eigenvalues use the cheaper cube form.
266 if (requirementFlags & BEAGLE_FLAG_EIGEN_COMPLEX || preferenceFlags & BEAGLE_FLAG_EIGEN_COMPLEX)
267 kFlags |= BEAGLE_FLAG_EIGEN_COMPLEX;
269 kFlags |= BEAGLE_FLAG_EIGEN_REAL;
271 if (requirementFlags & BEAGLE_FLAG_INVEVEC_TRANSPOSED || preferenceFlags & BEAGLE_FLAG_INVEVEC_TRANSPOSED)
272 kFlags |= BEAGLE_FLAG_INVEVEC_TRANSPOSED;
274 kFlags |= BEAGLE_FLAG_INVEVEC_STANDARD;
276 if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX)
277 gEigenDecomposition =
new EigenDecompositionSquare<BEAGLE_CPU_EIGEN_GENERIC>(kEigenDecompCount,
278 kStateCount,kCategoryCount,kFlags);
280 gEigenDecomposition =
new EigenDecompositionCube<BEAGLE_CPU_EIGEN_GENERIC>(kEigenDecompCount,
281 kStateCount, kCategoryCount,kFlags);
// Category rates and pattern weights are always stored as double.
283 gCategoryRates = (
double*) malloc(
sizeof(
double) * kCategoryCount);
284 if (gCategoryRates == NULL)
285 throw std::bad_alloc();
287 gPatternWeights = (
double*) malloc(
sizeof(
double) * kPatternCount);
288 if (gPatternWeights == NULL)
289 throw std::bad_alloc();
292 kPartialsSize = kPaddedPatternCount * kPartialsPaddedStateCount * kCategoryCount;
294 gPartials = (REALTYPE**) malloc(
sizeof(REALTYPE*) * kBufferCount);
295 if (gPartials == NULL)
296 throw std::bad_alloc();
// calloc zero-fills so unset slots read as NULL.
298 gStateFrequencies = (REALTYPE**) calloc(
sizeof(REALTYPE*), kEigenDecompCount);
299 if (gStateFrequencies == NULL)
300 throw std::bad_alloc();
302 gCategoryWeights = (REALTYPE**) calloc(
sizeof(REALTYPE*), kEigenDecompCount);
303 if (gCategoryWeights == NULL)
304 throw std::bad_alloc();
308 gTipStates = (
int**) malloc(
sizeof(
int*) * kBufferCount);
309 if (gTipStates == NULL)
310 throw std::bad_alloc();
312 for (
int i = 0; i < kBufferCount; i++) {
314 gTipStates[i] = NULL;
// Internal (non-tip) nodes always get a partials buffer up front.
317 for (
int i = kTipCount; i < kBufferCount; i++) {
318 gPartials[i] = (REALTYPE*) mallocAligned(
sizeof(REALTYPE) * kPartialsSize);
319 if (gPartials[i] == NULL)
320 throw std::bad_alloc();
323 gScaleBuffers = NULL;
325 gAutoScaleBuffers = NULL;
// AUTO scaling: per-node short exponent buffers + one cumulative real
// buffer; otherwise kScaleBufferCount independent real buffers.
327 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
328 gAutoScaleBuffers = (
signed short**) malloc(
sizeof(
signed short*) * kScaleBufferCount);
329 if (gAutoScaleBuffers == NULL)
330 throw std::bad_alloc();
331 for (
int i = 0; i < kScaleBufferCount; i++) {
332 gAutoScaleBuffers[i] = (
signed short*) malloc(
sizeof(
signed short) * scaleBufferSize);
333 if (gAutoScaleBuffers[i] == 0L)
334 throw std::bad_alloc();
336 gActiveScalingFactors = (
int*) malloc(
sizeof(
int) * kInternalPartialsBufferCount);
337 gScaleBuffers = (REALTYPE**) malloc(
sizeof(REALTYPE*));
338 gScaleBuffers[0] = (REALTYPE*) malloc(
sizeof(REALTYPE) * scaleBufferSize);
340 gScaleBuffers = (REALTYPE**) malloc(
sizeof(REALTYPE*) * kScaleBufferCount);
341 if (gScaleBuffers == NULL)
342 throw std::bad_alloc();
344 for (
int i = 0; i < kScaleBufferCount; i++) {
345 gScaleBuffers[i] = (REALTYPE*) malloc(
sizeof(REALTYPE) * scaleBufferSize);
347 if (gScaleBuffers[i] == 0L)
348 throw std::bad_alloc();
// Dynamic scaling starts from multiplicative-identity factors.
351 if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
352 for (
int j=0; j < scaleBufferSize; j++) {
353 gScaleBuffers[i][j] = 1.0;
360 gTransitionMatrices = (REALTYPE**) malloc(
sizeof(REALTYPE*) * kMatrixCount);
361 if (gTransitionMatrices == NULL)
362 throw std::bad_alloc();
363 for (
int i = 0; i < kMatrixCount; i++) {
364 gTransitionMatrices[i] = (REALTYPE*) mallocAligned(
sizeof(REALTYPE) * kMatrixSize * kCategoryCount);
365 if (gTransitionMatrices[i] == 0L)
366 throw std::bad_alloc();
// Scratch work areas for likelihood/derivative integration.
// NOTE(review): these mallocs are not NULL-checked like the ones above.
369 integrationTmp = (REALTYPE*) mallocAligned(
sizeof(REALTYPE) * kPatternCount * kStateCount);
370 firstDerivTmp = (REALTYPE*) malloc(
sizeof(REALTYPE) * kPatternCount * kStateCount);
371 secondDerivTmp = (REALTYPE*) malloc(
sizeof(REALTYPE) * kPatternCount * kStateCount);
373 outLogLikelihoodsTmp = (REALTYPE*) malloc(
sizeof(REALTYPE) * kPatternCount * kStateCount);
374 outFirstDerivativesTmp = (REALTYPE*) malloc(
sizeof(REALTYPE) * kPatternCount * kStateCount);
375 outSecondDerivativesTmp = (REALTYPE*) malloc(
sizeof(REALTYPE) * kPatternCount * kStateCount);
377 zeros = (REALTYPE*) malloc(
sizeof(REALTYPE) * kPaddedPatternCount);
378 ones = (REALTYPE*) malloc(
sizeof(REALTYPE) * kPaddedPatternCount);
379 for(
int i = 0; i < kPaddedPatternCount; i++) {
384 return BEAGLE_SUCCESS;
// Returns the implementation name via the precision-specific
// getBeagleCPUName specialization.
388 const char* BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getName() {
389 return getBeagleCPUName<BEAGLE_CPU_FACTORY_GENERIC>();
// Returns the static capability flags via the precision-specific
// getBeagleCPUFlags specialization.
393 const long BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getFlags() {
394 return getBeagleCPUFlags<BEAGLE_CPU_FACTORY_GENERIC>();
// Body of getInstanceDetails (the signature line was dropped by
// extraction -- presumably takes a BeagleInstanceDetails* returnInfo;
// verify against upstream).  Fills the caller's struct with the static
// flags merged with this instance's runtime kFlags, plus the name.
399 if (returnInfo != NULL) {
401 returnInfo->
flags = getFlags();
402 returnInfo->
flags |= kFlags;
404 returnInfo->
implName = (
char*) getName();
407 return BEAGLE_SUCCESS;
// setTipStates: stores compact (integer) state codes for a tip.
// Out-of-range state codes are clamped to kStateCount (the "gap/
// ambiguous" sentinel), and the padded tail is filled with that same
// sentinel.  Returns BEAGLE_ERROR_OUT_OF_RANGE for a bad tip index.
// NOTE(review): the mallocAligned result is not NULL-checked here.
411 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setTipStates(
int tipIndex,
412 const int* inStates) {
413 if (tipIndex < 0 || tipIndex >= kTipCount)
414 return BEAGLE_ERROR_OUT_OF_RANGE;
415 gTipStates[tipIndex] = (
int*) mallocAligned(
sizeof(
int) * kPaddedPatternCount);
417 for (
int j = 0; j < kPatternCount; j++) {
418 gTipStates[tipIndex][j] = (inStates[j] < kStateCount ? inStates[j] : kStateCount);
420 for (
int j = kPatternCount; j < kPaddedPatternCount; j++) {
421 gTipStates[tipIndex][j] = kStateCount;
424 return BEAGLE_SUCCESS;
// setTipPartials: copies caller-supplied double partials for one tip
// into the instance's REALTYPE storage, lazily allocating the buffer.
// The same kPatternCount*kStateCount input is replicated once per rate
// category, each pattern row padded to kPartialsPaddedStateCount, and
// the padded-pattern tail of each category is zero-filled.
428 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setTipPartials(
int tipIndex,
429 const double* inPartials) {
430 if (tipIndex < 0 || tipIndex >= kTipCount)
431 return BEAGLE_ERROR_OUT_OF_RANGE;
432 if(gPartials[tipIndex] == NULL) {
433 gPartials[tipIndex] = (REALTYPE*) mallocAligned(
sizeof(REALTYPE) * kPartialsSize);
435 if (gPartials[tipIndex] == 0L)
436 return BEAGLE_ERROR_OUT_OF_MEMORY;
439 const double* inPartialsOffset;
440 REALTYPE* tmpRealPartialsOffset = gPartials[tipIndex];
441 for (
int l = 0; l < kCategoryCount; l++) {
// Rewind to the start of the input for every category (replication).
442 inPartialsOffset = inPartials;
443 for (
int i = 0; i < kPatternCount; i++) {
444 beagleMemCpy(tmpRealPartialsOffset, inPartialsOffset, kStateCount);
445 tmpRealPartialsOffset += kPartialsPaddedStateCount;
446 inPartialsOffset += kStateCount;
// Zero the padding patterns at the end of this category block.
449 for(
int k = 0; k < kPartialsPaddedStateCount * (kPaddedPatternCount - kPatternCount); k++) {
450 *tmpRealPartialsOffset++ = 0;
454 return BEAGLE_SUCCESS;
// setPartials: copies a full (per-category) partials array into the
// given buffer, lazily allocating it.  Unlike setTipPartials the input
// is consumed sequentially -- it already contains one block per rate
// category.  Padded-pattern tails are zero-filled per category.
458 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setPartials(
int bufferIndex,
459 const double* inPartials) {
460 if (bufferIndex < 0 || bufferIndex >= kBufferCount)
461 return BEAGLE_ERROR_OUT_OF_RANGE;
462 if (gPartials[bufferIndex] == NULL) {
463 gPartials[bufferIndex] = (REALTYPE*) malloc(
sizeof(REALTYPE) * kPartialsSize);
464 if (gPartials[bufferIndex] == 0L)
465 return BEAGLE_ERROR_OUT_OF_MEMORY;
468 const double* inPartialsOffset = inPartials;
469 REALTYPE* tmpRealPartialsOffset = gPartials[bufferIndex];
470 for (
int l = 0; l < kCategoryCount; l++) {
471 for (
int i = 0; i < kPatternCount; i++) {
472 beagleMemCpy(tmpRealPartialsOffset, inPartialsOffset, kStateCount);
473 tmpRealPartialsOffset += kPartialsPaddedStateCount;
474 inPartialsOffset += kStateCount;
// Zero the padding patterns at the end of this category block.
477 for(
int k = 0; k < kPartialsPaddedStateCount * (kPaddedPatternCount - kPatternCount); k++) {
478 *tmpRealPartialsOffset++ = 0;
482 return BEAGLE_SUCCESS;
// getPartials: copies a partials buffer back to the caller as doubles.
// Fast path when no pattern padding exists; otherwise copies category
// by category, skipping the padded tail.  If a cumulative scale buffer
// is given, un-scales the output by exp(scaleFactor) per pattern.
// NOTE(review): 'offsetOutPartials' is declared but its initialization
// (presumably '= outPartials') and the 'index' variable setup appear on
// lines dropped by extraction -- verify against upstream.
486 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getPartials(
int bufferIndex,
487 int cumulativeScaleIndex,
488 double* outPartials) {
492 if (bufferIndex < 0 || bufferIndex >= kBufferCount)
493 return BEAGLE_ERROR_OUT_OF_RANGE;
495 if (kPatternCount == kPaddedPatternCount) {
496 beagleMemCpy(outPartials, gPartials[bufferIndex], kPartialsSize);
498 double *offsetOutPartials;
499 REALTYPE* offsetBeaglePartials = gPartials[bufferIndex];
500 for(
int i = 0; i < kCategoryCount; i++) {
501 beagleMemCpy(offsetOutPartials,offsetBeaglePartials,
502 kPatternCount * kStateCount);
503 offsetOutPartials += kPatternCount * kStateCount;
504 offsetBeaglePartials += kPaddedPatternCount * kStateCount;
508 if (cumulativeScaleIndex != BEAGLE_OP_NONE) {
509 REALTYPE* cumulativeScaleBuffer = gScaleBuffers[cumulativeScaleIndex];
511 for(
int k=0; k<kPatternCount; k++) {
512 REALTYPE scaleFactor = exp(cumulativeScaleBuffer[k]);
513 for(
int i=0; i<kStateCount; i++) {
514 outPartials[index] *= scaleFactor;
521 return BEAGLE_SUCCESS;
// setEigenDecomposition: thin forwarder to the owned eigen-
// decomposition helper (cube or square form chosen at createInstance).
525 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setEigenDecomposition(
int eigenIndex,
526 const double* inEigenVectors,
527 const double* inInverseEigenVectors,
528 const double* inEigenValues) {
530 gEigenDecomposition->setEigenDecomposition(eigenIndex, inEigenVectors, inInverseEigenVectors, inEigenValues);
531 return BEAGLE_SUCCESS;
// setCategoryRates: copies kCategoryCount rates (stored as double).
// NOTE(review): no NULL check on the input, unlike setPatternWeights.
535 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setCategoryRates(
const double* inCategoryRates) {
536 memcpy(gCategoryRates, inCategoryRates,
sizeof(
double) * kCategoryCount);
537 return BEAGLE_SUCCESS;
// setPatternWeights: copies kPatternCount site weights (as double).
541 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setPatternWeights(
const double* inPatternWeights) {
542 assert(inPatternWeights != 0L);
543 memcpy(gPatternWeights, inPatternWeights,
sizeof(
double) * kPatternCount);
544 return BEAGLE_SUCCESS;
// setStateFrequencies: lazily allocates and fills the REALTYPE state-
// frequency vector for one eigen-decomposition slot.
548 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setStateFrequencies(
int stateFrequenciesIndex,
549 const double* inStateFrequencies) {
550 if (stateFrequenciesIndex < 0 || stateFrequenciesIndex >= kEigenDecompCount)
551 return BEAGLE_ERROR_OUT_OF_RANGE;
552 if (gStateFrequencies[stateFrequenciesIndex] == NULL) {
553 gStateFrequencies[stateFrequenciesIndex] = (REALTYPE*) malloc(
sizeof(REALTYPE) * kStateCount);
554 if (gStateFrequencies[stateFrequenciesIndex] == 0L)
555 return BEAGLE_ERROR_OUT_OF_MEMORY;
// beagleMemCpy converts double -> REALTYPE element-wise.
557 beagleMemCpy(gStateFrequencies[stateFrequenciesIndex], inStateFrequencies, kStateCount);
559 return BEAGLE_SUCCESS;
// setCategoryWeights: lazily allocates and fills the REALTYPE category-
// weight vector for one eigen-decomposition slot (mirror of
// setStateFrequencies).
563 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setCategoryWeights(
int categoryWeightsIndex,
564 const double* inCategoryWeights) {
565 if (categoryWeightsIndex < 0 || categoryWeightsIndex >= kEigenDecompCount)
566 return BEAGLE_ERROR_OUT_OF_RANGE;
567 if (gCategoryWeights[categoryWeightsIndex] == NULL) {
568 gCategoryWeights[categoryWeightsIndex] = (REALTYPE*) malloc(
sizeof(REALTYPE) * kCategoryCount);
569 if (gCategoryWeights[categoryWeightsIndex] == 0L)
570 return BEAGLE_ERROR_OUT_OF_MEMORY;
572 beagleMemCpy(gCategoryWeights[categoryWeightsIndex], inCategoryWeights, kCategoryCount);
574 return BEAGLE_SUCCESS;
// getTransitionMatrix: copies one stored transition matrix out to the
// caller, stripping the T_PAD column padding row by row; the trailing
// beagleMemCpy is the unpadded fast path.  NOTE(review): the outMatrix
// parameter line and the branch structure selecting between the padded
// and unpadded paths appear to be on lines dropped by extraction --
// verify against upstream.
578 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getTransitionMatrix(
int matrixIndex,
582 double* offsetOutMatrix = outMatrix;
583 REALTYPE* offsetBeagleMatrix = gTransitionMatrices[matrixIndex];
584 for(
int i = 0; i < kCategoryCount; i++) {
585 for(
int j = 0; j < kStateCount; j++) {
586 beagleMemCpy(offsetOutMatrix,offsetBeagleMatrix,kStateCount);
// Skip the padded column(s) in the stored matrix.
587 offsetBeagleMatrix += kTransPaddedStateCount;
588 offsetOutMatrix += kStateCount;
592 beagleMemCpy(outMatrix,gTransitionMatrices[matrixIndex],
593 kMatrixSize * kCategoryCount);
595 return BEAGLE_SUCCESS;
// getSiteLogLikelihoods: exports the per-pattern log-likelihoods
// computed by the last root/edge likelihood call.
599 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getSiteLogLikelihoods(
double* outLogLikelihoods) {
600 beagleMemCpy(outLogLikelihoods, outLogLikelihoodsTmp, kPatternCount);
602 return BEAGLE_SUCCESS;
// getSiteDerivatives: exports per-pattern first (and optionally second)
// derivatives from the last derivative-enabled likelihood call.
606 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getSiteDerivatives(
double* outFirstDerivatives,
607 double* outSecondDerivatives) {
608 beagleMemCpy(outFirstDerivatives, outFirstDerivativesTmp, kPatternCount);
609 if (outSecondDerivatives != NULL)
610 beagleMemCpy(outSecondDerivatives, outSecondDerivativesTmp, kPatternCount);
612 return BEAGLE_SUCCESS;
// setTransitionMatrix: copies one caller matrix into padded storage,
// writing paddedValue into the extra T_PAD column of each row; the
// trailing beagleMemCpy is the unpadded fast path (branch selection is
// on lines dropped by extraction -- verify against upstream).
617 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setTransitionMatrix(
int matrixIndex,
618 const double* inMatrix,
619 double paddedValue) {
622 const double* offsetInMatrix = inMatrix;
623 REALTYPE* offsetBeagleMatrix = gTransitionMatrices[matrixIndex];
624 for(
int i = 0; i < kCategoryCount; i++) {
625 for(
int j = 0; j < kStateCount; j++) {
626 beagleMemCpy(offsetBeagleMatrix, offsetInMatrix, kStateCount);
// Fill the padding column just past the real row.
627 offsetBeagleMatrix[kStateCount] = paddedValue;
628 offsetBeagleMatrix += kTransPaddedStateCount;
629 offsetInMatrix += kStateCount;
633 beagleMemCpy(gTransitionMatrices[matrixIndex], inMatrix,
634 kMatrixSize * kCategoryCount);
636 return BEAGLE_SUCCESS;
// setTransitionMatrices: batched form of setTransitionMatrix -- copies
// 'count' matrices (count parameter line appears dropped by
// extraction) with a per-matrix padding value.
640 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setTransitionMatrices(
const int* matrixIndices,
641 const double* inMatrices,
642 const double* paddedValues,
644 for (
int k = 0; k < count; k++) {
// Stride into the packed input: one full matrix set per index.
645 const double* inMatrix = inMatrices + k*kStateCount*kStateCount*kCategoryCount;
646 int matrixIndex = matrixIndices[k];
649 const double* offsetInMatrix = inMatrix;
650 REALTYPE* offsetBeagleMatrix = gTransitionMatrices[matrixIndex];
651 for(
int i = 0; i < kCategoryCount; i++) {
652 for(
int j = 0; j < kStateCount; j++) {
653 beagleMemCpy(offsetBeagleMatrix, offsetInMatrix, kStateCount);
654 offsetBeagleMatrix[kStateCount] = paddedValues[k];
655 offsetBeagleMatrix += kTransPaddedStateCount;
656 offsetInMatrix += kStateCount;
660 beagleMemCpy(gTransitionMatrices[matrixIndex], inMatrix,
661 kMatrixSize * kCategoryCount);
665 return BEAGLE_SUCCESS;
// updateTransitionMatrices: forwards to the eigen-decomposition helper
// to exponentiate Q*t for each requested edge length and category rate.
669 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::updateTransitionMatrices(
int eigenIndex,
670 const int* probabilityIndices,
671 const int* firstDerivativeIndices,
672 const int* secondDerivativeIndices,
673 const double* edgeLengths,
675 gEigenDecomposition->updateTransitionMatrices(eigenIndex,probabilityIndices,firstDerivativeIndices,secondDerivativeIndices,
676 edgeLengths,gCategoryRates,gTransitionMatrices,count);
677 return BEAGLE_SUCCESS;
// updatePartials: the peeling workhorse.  Each operation is a 7-int
// tuple {dest, writeScale, readScale, child1, child1Matrix, child2,
// child2Matrix}; depending on whether each child holds compact tip
// states or full partials, dispatches to the states/states,
// states/partials or partials/partials kernel, with the scaling
// variant chosen by the active scaling mode.  NOTE(review): several
// branch/else/brace lines appear dropped by extraction (e.g. the
// assignments to 'rescale') -- verify control flow against upstream.
681 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::updatePartials(
const int* operations,
683 int cumulativeScaleIndex) {
685 REALTYPE* cumulativeScaleBuffer = NULL;
686 if (cumulativeScaleIndex != BEAGLE_OP_NONE)
687 cumulativeScaleBuffer = gScaleBuffers[cumulativeScaleIndex];
689 for (
int op = 0; op < count; op++) {
690 if (DEBUGGING_OUTPUT) {
691 std::cerr <<
"op[0]= " << operations[0] <<
"\n";
692 std::cerr <<
"op[1]= " << operations[1] <<
"\n";
693 std::cerr <<
"op[2]= " << operations[2] <<
"\n";
694 std::cerr <<
"op[3]= " << operations[3] <<
"\n";
695 std::cerr <<
"op[4]= " << operations[4] <<
"\n";
696 std::cerr <<
"op[5]= " << operations[5] <<
"\n";
697 std::cerr <<
"op[6]= " << operations[6] <<
"\n";
// Unpack the 7-int operation descriptor.
700 const int parIndex = operations[op * 7];
701 const int writeScalingIndex = operations[op * 7 + 1];
702 const int readScalingIndex = operations[op * 7 + 2];
703 const int child1Index = operations[op * 7 + 3];
704 const int child1TransMatIndex = operations[op * 7 + 4];
705 const int child2Index = operations[op * 7 + 5];
706 const int child2TransMatIndex = operations[op * 7 + 6];
708 const REALTYPE* partials1 = gPartials[child1Index];
709 const REALTYPE* partials2 = gPartials[child2Index];
// A non-NULL tip-state pointer selects the compact-state kernel.
711 const int* tipStates1 = gTipStates[child1Index];
712 const int* tipStates2 = gTipStates[child2Index];
714 const REALTYPE* matrices1 = gTransitionMatrices[child1TransMatIndex];
715 const REALTYPE* matrices2 = gTransitionMatrices[child2TransMatIndex];
717 REALTYPE* destPartials = gPartials[parIndex];
719 int rescale = BEAGLE_OP_NONE;
720 REALTYPE* scalingFactors = NULL;
// Select scaling behavior for this operation.
722 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
723 gActiveScalingFactors[parIndex - kTipCount] = 0;
724 if (tipStates1 == 0 && tipStates2 == 0)
726 }
else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
728 scalingFactors = gScaleBuffers[parIndex - kTipCount];
729 }
else if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
730 if (tipStates1 == 0 && tipStates2 == 0) {
732 removeScaleFactors(&readScalingIndex, 1, cumulativeScaleIndex);
733 scalingFactors = gScaleBuffers[writeScalingIndex];
735 }
else if (writeScalingIndex >= 0) {
737 scalingFactors = gScaleBuffers[writeScalingIndex];
738 }
else if (readScalingIndex >= 0) {
740 scalingFactors = gScaleBuffers[readScalingIndex];
743 if (DEBUGGING_OUTPUT) {
744 std::cerr <<
"Rescale= " << rescale <<
" writeIndex= " << writeScalingIndex
745 <<
" readIndex = " << readScalingIndex <<
"\n";
// Kernel dispatch on (tip-state vs partials) for each child.
748 if (tipStates1 != NULL) {
749 if (tipStates2 != NULL ) {
751 calcStatesStatesFixedScaling(destPartials, tipStates1, matrices1, tipStates2, matrices2,
755 calcStatesStates(destPartials, tipStates1, matrices1, tipStates2, matrices2);
757 rescalePartials(destPartials,scalingFactors,cumulativeScaleBuffer,0);
761 calcStatesPartialsFixedScaling(destPartials, tipStates1, matrices1, partials2, matrices2,
764 calcStatesPartials(destPartials, tipStates1, matrices1, partials2, matrices2);
766 rescalePartials(destPartials,scalingFactors,cumulativeScaleBuffer,0);
770 if (tipStates2 != NULL) {
// Children swapped so the states child always comes first.
772 calcStatesPartialsFixedScaling(destPartials,tipStates2,matrices2,partials1,matrices1,
775 calcStatesPartials(destPartials, tipStates2, matrices2, partials1, matrices1);
777 rescalePartials(destPartials,scalingFactors,cumulativeScaleBuffer,0);
781 int sIndex = parIndex - kTipCount;
782 calcPartialsPartialsAutoScaling(destPartials,partials1,matrices1,partials2,matrices2,
783 &gActiveScalingFactors[sIndex]);
784 if (gActiveScalingFactors[sIndex])
785 autoRescalePartials(destPartials, gAutoScaleBuffers[sIndex]);
787 }
else if (rescale == 0) {
788 calcPartialsPartialsFixedScaling(destPartials,partials1,matrices1,partials2,matrices2,
791 calcPartialsPartials(destPartials, partials1, matrices1, partials2, matrices2);
793 rescalePartials(destPartials,scalingFactors,cumulativeScaleBuffer,0);
// ALWAYS scaling: fold the children's accumulated factors into the
// parent's buffer so each node carries its subtree's total.
798 if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
799 int parScalingIndex = parIndex - kTipCount;
800 int child1ScalingIndex = child1Index - kTipCount;
801 int child2ScalingIndex = child2Index - kTipCount;
802 if (child1ScalingIndex >= 0 && child2ScalingIndex >= 0) {
803 int scalingIndices[2] = {child1ScalingIndex, child2ScalingIndex};
804 accumulateScaleFactors(scalingIndices, 2, parScalingIndex);
805 }
else if (child1ScalingIndex >= 0) {
806 int scalingIndices[1] = {child1ScalingIndex};
807 accumulateScaleFactors(scalingIndices, 1, parScalingIndex);
808 }
else if (child2ScalingIndex >= 0) {
809 int scalingIndices[1] = {child2ScalingIndex};
810 accumulateScaleFactors(scalingIndices, 1, parScalingIndex);
814 if (DEBUGGING_OUTPUT) {
815 if (scalingFactors != NULL && rescale == 0) {
816 for(
int i=0; i<kPatternCount; i++)
817 fprintf(stderr,
"old scaleFactor[%d] = %.5f\n",i,scalingFactors[i]);
819 fprintf(stderr,
"Result partials:\n");
820 for(
int i = 0; i < kPartialsSize; i++)
821 fprintf(stderr,
"destP[%d] = %.5f\n",i,destPartials[i]);
825 return BEAGLE_SUCCESS;
// waitForPartials: no-op -- this implementation is synchronous
// (BEAGLE_FLAG_COMPUTATION_SYNCH), so results are ready on return.
830 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::waitForPartials(
const int* destinationPartials,
831 int destinationPartialsCount) {
832 return BEAGLE_SUCCESS;
// calculateRootLogLikelihoods: public entry point.  For the single-
// subset case resolves the cumulative scale index per the scaling mode
// and calls calcRootLogLikelihoods; otherwise defers to the multi-
// subset variant.  NOTE(review): the 'count' parameter line and the
// count==1 branch structure appear dropped by extraction.
837 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calculateRootLogLikelihoods(
const int* bufferIndices,
838 const int* categoryWeightsIndices,
839 const int* stateFrequenciesIndices,
840 const int* cumulativeScaleIndices,
842 double* outSumLogLikelihood) {
847 int cumulativeScalingFactorIndex;
848 if (kFlags & BEAGLE_FLAG_SCALING_AUTO)
849 cumulativeScalingFactorIndex = 0;
850 else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)
851 cumulativeScalingFactorIndex = bufferIndices[0] - kTipCount;
853 cumulativeScalingFactorIndex = cumulativeScaleIndices[0];
854 return calcRootLogLikelihoods(bufferIndices[0], categoryWeightsIndices[0], stateFrequenciesIndices[0],
855 cumulativeScalingFactorIndex, outSumLogLikelihood);
859 return calcRootLogLikelihoodsMulti(bufferIndices, categoryWeightsIndices, stateFrequenciesIndices,
860 cumulativeScaleIndices, count, outSumLogLikelihood);
// calcRootLogLikelihoodsMulti: root log-likelihood over several root
// buffers ("subsets").  Integrates each subset over rate categories
// and states, tracks the per-pattern maximum scale factor across
// subsets so their site likelihoods can be summed in a common scaling,
// then log-transforms on the final subset and adds back the max scale
// factors.  Result is weight-summed into *outSumLogLikelihood; a NaN
// total yields BEAGLE_ERROR_FLOATING_POINT.  NOTE(review): the 'count'
// parameter line and the accumulators 'u'/'v'/'sum' setup appear on
// lines dropped by extraction -- verify against upstream.
865 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcRootLogLikelihoodsMulti(
const int* bufferIndices,
866 const int* categoryWeightsIndices,
867 const int* stateFrequenciesIndices,
868 const int* scaleBufferIndices,
870 double* outSumLogLikelihood) {
880 std::vector<int> indexMaxScale(kPatternCount);
881 std::vector<REALTYPE> maxScaleFactor(kPatternCount);
883 int returnCode = BEAGLE_SUCCESS;
885 for (
int subsetIndex = 0 ; subsetIndex < count; ++subsetIndex ) {
886 const int rootPartialIndex = bufferIndices[subsetIndex];
887 const REALTYPE* rootPartials = gPartials[rootPartialIndex];
888 const REALTYPE* frequencies = gStateFrequencies[stateFrequenciesIndices[subsetIndex]];
889 const REALTYPE* wt = gCategoryWeights[categoryWeightsIndices[subsetIndex]];
// Category 0 initializes integrationTmp ...
892 for (
int k = 0; k < kPatternCount; k++) {
893 for (
int i = 0; i < kStateCount; i++) {
894 integrationTmp[u] = rootPartials[v] * (REALTYPE) wt[0];
// ... remaining categories accumulate into it.
900 for (
int l = 1; l < kCategoryCount; l++) {
902 for (
int k = 0; k < kPatternCount; k++) {
903 for (
int i = 0; i < kStateCount; i++) {
904 integrationTmp[u] += rootPartials[v] * (REALTYPE) wt[l];
// Weight by equilibrium frequencies to get per-pattern likelihoods.
912 for (
int k = 0; k < kPatternCount; k++) {
914 for (
int i = 0; i < kStateCount; i++) {
915 sum += ((REALTYPE)frequencies[i]) * integrationTmp[u];
// If scaled, rebase this subset's site likelihood onto the maximum
// scale factor seen across subsets (computed on the first pass).
920 if (scaleBufferIndices[0] != BEAGLE_OP_NONE || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) {
921 int cumulativeScalingFactorIndex;
922 if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)
923 cumulativeScalingFactorIndex = rootPartialIndex - kTipCount;
925 cumulativeScalingFactorIndex = scaleBufferIndices[subsetIndex];
927 const REALTYPE* cumulativeScaleFactors = gScaleBuffers[cumulativeScalingFactorIndex];
929 if (subsetIndex == 0) {
930 indexMaxScale[k] = 0;
931 maxScaleFactor[k] = cumulativeScaleFactors[k];
932 for (
int j = 1; j < count; j++) {
933 REALTYPE tmpScaleFactor;
934 if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)
935 tmpScaleFactor = gScaleBuffers[bufferIndices[j] - kTipCount][k];
937 tmpScaleFactor = gScaleBuffers[scaleBufferIndices[j]][k];
939 if (tmpScaleFactor > maxScaleFactor[k]) {
940 indexMaxScale[k] = j;
941 maxScaleFactor[k] = tmpScaleFactor;
946 if (subsetIndex != indexMaxScale[k])
947 sum *= exp((REALTYPE)(cumulativeScaleFactors[k] - maxScaleFactor[k]));
// Accumulate across subsets; take the log only on the last one.
950 if (subsetIndex == 0) {
951 outLogLikelihoodsTmp[k] = sum;
952 }
else if (subsetIndex == count - 1) {
953 REALTYPE tmpSum = outLogLikelihoodsTmp[k] + sum;
955 outLogLikelihoodsTmp[k] = log(tmpSum);
957 outLogLikelihoodsTmp[k] += sum;
962 if (scaleBufferIndices[0] != BEAGLE_OP_NONE || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) {
963 for(
int i=0; i<kPatternCount; i++)
964 outLogLikelihoodsTmp[i] += maxScaleFactor[i];
967 *outSumLogLikelihood = 0.0;
968 for (
int i = 0; i < kPatternCount; i++) {
969 *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
// x - x != 0 only for NaN/Inf: floating-point failure detection.
972 if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
973 returnCode = BEAGLE_ERROR_FLOATING_POINT;
// calcRootLogLikelihoods: single-subset root log-likelihood.
// Integrates the root partials over rate categories (weighted by wt),
// then over states (weighted by freqs), logs each pattern, adds the
// cumulative scale factors if present, and weight-sums into
// *outSumLogLikelihood.  NaN total -> BEAGLE_ERROR_FLOATING_POINT.
// NOTE(review): the 'u'/'v'/'sum' accumulator setup lines appear
// dropped by extraction -- verify against upstream.
980 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcRootLogLikelihoods(
const int bufferIndex,
981 const int categoryWeightsIndex,
982 const int stateFrequenciesIndex,
983 const int scalingFactorsIndex,
984 double* outSumLogLikelihood) {
986 int returnCode = BEAGLE_SUCCESS;
988 const REALTYPE* rootPartials = gPartials[bufferIndex];
989 const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];
990 const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndex];
// Category 0 initializes integrationTmp ...
993 for (
int k = 0; k < kPatternCount; k++) {
994 for (
int i = 0; i < kStateCount; i++) {
995 integrationTmp[u] = rootPartials[v] * (REALTYPE) wt[0];
// ... remaining categories accumulate into it.
1001 for (
int l = 1; l < kCategoryCount; l++) {
1003 for (
int k = 0; k < kPatternCount; k++) {
1004 for (
int i = 0; i < kStateCount; i++) {
1005 integrationTmp[u] += rootPartials[v] * (REALTYPE) wt[l];
// Weight by frequencies and log-transform per pattern.
1013 for (
int k = 0; k < kPatternCount; k++) {
1015 for (
int i = 0; i < kStateCount; i++) {
1016 sum += freqs[i] * integrationTmp[u];
1020 outLogLikelihoodsTmp[k] = log(sum);
1023 if (scalingFactorsIndex >= 0) {
1024 const REALTYPE* cumulativeScaleFactors = gScaleBuffers[scalingFactorsIndex];
1025 for(
int i=0; i<kPatternCount; i++) {
1026 outLogLikelihoodsTmp[i] += cumulativeScaleFactors[i];
1030 *outSumLogLikelihood = 0.0;
1031 for (
int i = 0; i < kPatternCount; i++) {
1032 *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
// x - x != 0 only for NaN/Inf: floating-point failure detection.
1035 if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
1036 returnCode = BEAGLE_ERROR_FLOATING_POINT;
// accumulateScaleFactors: adds 'count' scale buffers into a cumulative
// buffer.  AUTO mode rebuilds gScaleBuffers[0] from the active short
// exponent buffers (each unit is a power of two, hence M_LN2); manual
// modes add the factors directly (log scalers) or their logs (raw).
1045 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::accumulateScaleFactors(
const int* scalingIndices,
1047 int cumulativeScalingIndex) {
1048 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
1049 REALTYPE* cumulativeScaleBuffer = gScaleBuffers[0];
1050 for(
int j=0; j<kPatternCount; j++)
1051 cumulativeScaleBuffer[j] = 0;
1052 for(
int i=0; i<count; i++) {
1053 int sIndex = scalingIndices[i] - kTipCount;
1054 if (gActiveScalingFactors[sIndex]) {
1055 const signed short* scaleBuffer = gAutoScaleBuffers[sIndex];
1056 for(
int j=0; j<kPatternCount; j++) {
// Exponent counts are in units of log(2).
1057 cumulativeScaleBuffer[j] += M_LN2 * scaleBuffer[j];
1063 REALTYPE* cumulativeScaleBuffer = gScaleBuffers[cumulativeScalingIndex];
1064 for(
int i=0; i<count; i++) {
1065 const REALTYPE* scaleBuffer = gScaleBuffers[scalingIndices[i]];
1066 for(
int j=0; j<kPatternCount; j++) {
1067 if (kFlags & BEAGLE_FLAG_SCALERS_LOG)
1068 cumulativeScaleBuffer[j] += scaleBuffer[j];
1070 cumulativeScaleBuffer[j] += log(scaleBuffer[j]);
1074 if (DEBUGGING_OUTPUT) {
1075 fprintf(stderr,
"Accumulating %d scale buffers into #%d\n",count,cumulativeScalingIndex);
1076 for(
int j=0; j<kPatternCount; j++) {
1077 fprintf(stderr,
"cumulativeScaleBuffer[%d] = %2.5e\n",j,cumulativeScaleBuffer[j]);
1082 return BEAGLE_SUCCESS;
// removeScaleFactors: exact inverse of the manual-mode branch of
// accumulateScaleFactors -- subtracts each listed buffer (or its log,
// for raw scalers) from the cumulative buffer.
1086 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::removeScaleFactors(
const int* scalingIndices,
1088 int cumulativeScalingIndex) {
1089 REALTYPE* cumulativeScaleBuffer = gScaleBuffers[cumulativeScalingIndex];
1090 for(
int i=0; i<count; i++) {
1091 const REALTYPE* scaleBuffer = gScaleBuffers[scalingIndices[i]];
1092 for(
int j=0; j<kPatternCount; j++) {
1093 if (kFlags & BEAGLE_FLAG_SCALERS_LOG)
1094 cumulativeScaleBuffer[j] -= scaleBuffer[j];
1096 cumulativeScaleBuffer[j] -= log(scaleBuffer[j]);
1100 return BEAGLE_SUCCESS;
// resetScaleFactors: zeroes a cumulative scale buffer.  Element size
// depends on mode: AUTO buffers hold signed shorts, others REALTYPE.
1104 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::resetScaleFactors(
int cumulativeScalingIndex) {
1107 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
1108 memset(gScaleBuffers[cumulativeScalingIndex], 0,
sizeof(
signed short) * kPaddedPatternCount);
1110 memset(gScaleBuffers[cumulativeScalingIndex], 0,
sizeof(REALTYPE) * kPaddedPatternCount);
1112 return BEAGLE_SUCCESS;
// copyScaleFactors: duplicates one scale buffer into another
// (kPatternCount REALTYPE entries; padding not copied).
1116 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::copyScaleFactors(
int destScalingIndex,
1117 int srcScalingIndex) {
1118 memcpy(gScaleBuffers[destScalingIndex],gScaleBuffers[srcScalingIndex],
sizeof(REALTYPE) * kPatternCount);
1120 return BEAGLE_SUCCESS;
// calculateEdgeLogLikelihoods: public entry point for edge (branch)
// log-likelihoods.  For a single edge, resolves/builds the cumulative
// scale buffer per the scaling mode (ALWAYS mode accumulates both
// endpoint buffers into the spare slot at kInternalPartialsBufferCount)
// and dispatches on which derivative index arrays are supplied.  For
// count > 1, only the no-derivative manual-scaling case is supported.
// NOTE(review): the 'count' parameter line, the count==1 branch
// structure and the unsupported-case return lines appear dropped by
// extraction -- verify against upstream.
1124 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calculateEdgeLogLikelihoods(
const int* parentBufferIndices,
1125 const int* childBufferIndices,
1126 const int* probabilityIndices,
1127 const int* firstDerivativeIndices,
1128 const int* secondDerivativeIndices,
1129 const int* categoryWeightsIndices,
1130 const int* stateFrequenciesIndices,
1131 const int* cumulativeScaleIndices,
1133 double* outSumLogLikelihood,
1134 double* outSumFirstDerivative,
1135 double* outSumSecondDerivative) {
1139 int cumulativeScalingFactorIndex;
1140 if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
1141 cumulativeScalingFactorIndex = 0;
1142 }
else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
// Use the reserved extra scale-buffer slot as scratch.
1143 cumulativeScalingFactorIndex = kInternalPartialsBufferCount;
1144 int child1ScalingIndex = parentBufferIndices[0] - kTipCount;
1145 int child2ScalingIndex = childBufferIndices[0] - kTipCount;
1146 resetScaleFactors(cumulativeScalingFactorIndex);
1147 if (child1ScalingIndex >= 0 && child2ScalingIndex >= 0) {
1148 int scalingIndices[2] = {child1ScalingIndex, child2ScalingIndex};
1149 accumulateScaleFactors(scalingIndices, 2, cumulativeScalingFactorIndex);
1150 }
else if (child1ScalingIndex >= 0) {
1151 int scalingIndices[1] = {child1ScalingIndex};
1152 accumulateScaleFactors(scalingIndices, 1, cumulativeScalingFactorIndex);
1153 }
else if (child2ScalingIndex >= 0) {
1154 int scalingIndices[1] = {child2ScalingIndex};
1155 accumulateScaleFactors(scalingIndices, 1, cumulativeScalingFactorIndex);
1158 cumulativeScalingFactorIndex = cumulativeScaleIndices[0];
// Dispatch on requested derivative order.
1160 if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL)
1161 return calcEdgeLogLikelihoods(parentBufferIndices[0], childBufferIndices[0], probabilityIndices[0],
1162 categoryWeightsIndices[0], stateFrequenciesIndices[0], cumulativeScalingFactorIndex,
1163 outSumLogLikelihood);
1164 else if (secondDerivativeIndices == NULL)
1165 return calcEdgeLogLikelihoodsFirstDeriv(parentBufferIndices[0], childBufferIndices[0], probabilityIndices[0],
1166 firstDerivativeIndices[0], categoryWeightsIndices[0], stateFrequenciesIndices[0],
1167 cumulativeScalingFactorIndex, outSumLogLikelihood, outSumFirstDerivative);
1169 return calcEdgeLogLikelihoodsSecondDeriv(parentBufferIndices[0], childBufferIndices[0], probabilityIndices[0],
1170 firstDerivativeIndices[0], secondDerivativeIndices[0], categoryWeightsIndices[0],
1171 stateFrequenciesIndices[0], cumulativeScalingFactorIndex, outSumLogLikelihood,
1172 outSumFirstDerivative, outSumSecondDerivative);
// count > 1 fallbacks (unimplemented combinations report to stderr).
1174 if ((kFlags & BEAGLE_FLAG_SCALING_AUTO) || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) {
1175 fprintf(stderr,
"BeagleCPUImpl::calculateEdgeLogLikelihoods not yet implemented for count > 1 and auto/always scaling\n");
1178 if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL) {
1179 return calcEdgeLogLikelihoodsMulti(parentBufferIndices, childBufferIndices, probabilityIndices,
1180 categoryWeightsIndices, stateFrequenciesIndices, cumulativeScaleIndices, count,
1181 outSumLogLikelihood);
1183 fprintf(stderr,
"BeagleCPUImpl::calculateEdgeLogLikelihoods not yet implemented for count > 1 and derivatives\n");
// Computes per-pattern log-likelihoods along the edge joining one parent
// internal node and one child, then folds them into a single
// pattern-weight-summed value.
//
//   parIndex              - parent partials buffer (must be internal: >= kTipCount)
//   childIndex            - child buffer; compact tip states are used when present
//   probIndex             - transition-probability matrix for the edge
//   categoryWeightsIndex  - rate-category weights
//   stateFrequenciesIndex - equilibrium state frequencies
//   scalingFactorsIndex   - cumulative (log-space) scale factors, or BEAGLE_OP_NONE
//   outSumLogLikelihood   - [out] sum over patterns of log L(pattern) * patternWeight
//
// Returns BEAGLE_SUCCESS, or BEAGLE_ERROR_FLOATING_POINT when the summed
// log-likelihood is not finite.
//
// NOTE(review): several bookkeeping lines (declarations/increments of the
// running offsets u and v, the inner counter j, and some closing braces)
// appear to be elided in this rendering of the file; comments below describe
// only the visible logic.
1192 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoods(
const int parIndex,
1193 const int childIndex,
1194 const int probIndex,
1195 const int categoryWeightsIndex,
1196 const int stateFrequenciesIndex,
1197 const int scalingFactorsIndex,
1198 double* outSumLogLikelihood) {
1200 assert(parIndex >= kTipCount);
1202 int returnCode = BEAGLE_SUCCESS;
1204 const REALTYPE* partialsParent = gPartials[parIndex];
1205 const REALTYPE* transMatrix = gTransitionMatrices[probIndex];
1206 const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];
1207 const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndex];
// integrationTmp accumulates, per (pattern, state), the category-weighted
// edge products before the final frequency integration.
1209 memset(integrationTmp, 0, (kPatternCount * kStateCount)*
sizeof(REALTYPE));
// Fast path: child is a compact tip, so the child-side sum collapses to a
// single matrix-column lookup (stateChild).
1212 if (childIndex < kTipCount && gTipStates[childIndex]) {
1214 const int* statesChild = gTipStates[childIndex];
1217 for(
int l = 0; l < kCategoryCount; l++) {
1219 const REALTYPE weight = wt[l];
1220 for(
int k = 0; k < kPatternCount; k++) {
1222 const int stateChild = statesChild[k];
1224 int w = l * kMatrixSize;
1225 for(
int i = 0; i < kStateCount; i++) {
1226 integrationTmp[u] += transMatrix[w + stateChild] * partialsParent[v + i] * weight;
// Advance w by the padded row width of the transition matrix.
1229 w += kTransPaddedStateCount;
1231 v += kPartialsPaddedStateCount;
// General path: child has full partials; compute the matrix-vector product
// with a 4-way unrolled dot product (two accumulators break the serial
// floating-point dependency chain).
1237 const REALTYPE* partialsChild = gPartials[childIndex];
1239 int stateCountModFour = (kStateCount / 4) * 4;
1241 for(
int l = 0; l < kCategoryCount; l++) {
1243 const REALTYPE weight = wt[l];
1244 for(
int k = 0; k < kPatternCount; k++) {
1245 int w = l * kMatrixSize;
1246 const REALTYPE* partialsChildPtr = &partialsChild[v];
1247 for(
int i = 0; i < kStateCount; i++) {
1248 double sumOverJA = 0.0, sumOverJB = 0.0;
1250 const REALTYPE* transMatrixPtr = &transMatrix[w];
1251 for (; j < stateCountModFour; j += 4) {
1252 sumOverJA += transMatrixPtr[j + 0] * partialsChildPtr[j + 0];
1253 sumOverJB += transMatrixPtr[j + 1] * partialsChildPtr[j + 1];
1254 sumOverJA += transMatrixPtr[j + 2] * partialsChildPtr[j + 2];
1255 sumOverJB += transMatrixPtr[j + 3] * partialsChildPtr[j + 3];
// Remainder loop for state counts not divisible by four.
1258 for (; j < kStateCount; j++) {
1259 sumOverJA += transMatrixPtr[j] * partialsChildPtr[j];
1265 integrationTmp[u] += (sumOverJA + sumOverJB) * partialsParent[v + i] * weight;
1273 v += kPartialsPaddedStateCount;
// Integrate over states against the equilibrium frequencies and take logs.
1279 for(
int k = 0; k < kPatternCount; k++) {
1280 REALTYPE sumOverI = 0.0;
1281 for(
int i = 0; i < kStateCount; i++) {
1282 sumOverI += freqs[i] * integrationTmp[u];
1286 outLogLikelihoodsTmp[k] = log(sumOverI);
// Cumulative scale factors live in log space; add them back per pattern.
1290 if (scalingFactorsIndex != BEAGLE_OP_NONE) {
1291 const REALTYPE* scalingFactors = gScaleBuffers[scalingFactorsIndex];
1292 for(
int k=0; k < kPatternCount; k++)
1293 outLogLikelihoodsTmp[k] += scalingFactors[k];
1296 *outSumLogLikelihood = 0.0;
1297 for (
int i = 0; i < kPatternCount; i++) {
1298 *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
// x - x == 0 fails only for NaN/Inf: a portable non-finite check without isnan().
1301 if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
1302 returnCode = BEAGLE_ERROR_FLOATING_POINT;
// Edge log-likelihoods summed over multiple parent/child subsets (indexed
// 0..count-1), e.g. mixture components sharing the same site patterns.
// For each subset the same integration kernel as calcEdgeLogLikelihoods is
// run; per-pattern subset likelihoods are then combined in LINEAR space,
// first re-expressed relative to the per-pattern maximum scale factor so
// that differently-scaled subsets can be added without under/overflow, and
// the log is taken only after the last subset.
//
// Returns BEAGLE_SUCCESS, or BEAGLE_ERROR_FLOATING_POINT when the summed
// log-likelihood is not finite.
//
// NOTE(review): a `count` parameter (used in the loops below) and several
// bookkeeping lines (offsets u/v, counter j, closing braces) are elided in
// this rendering; comments describe only the visible logic.
1308 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoodsMulti(
const int* parentBufferIndices,
1309 const int* childBufferIndices,
1310 const int* probabilityIndices,
1311 const int* categoryWeightsIndices,
1312 const int* stateFrequenciesIndices,
1313 const int* scalingFactorsIndices,
1315 double* outSumLogLikelihood) {
// Per-pattern record of which subset carries the largest cumulative scale
// factor, and that factor's value; used to renormalize before summing.
1317 std::vector<int> indexMaxScale(kPatternCount);
1318 std::vector<REALTYPE> maxScaleFactor(kPatternCount);
1320 int returnCode = BEAGLE_SUCCESS;
1322 for (
int subsetIndex = 0 ; subsetIndex < count; ++subsetIndex ) {
1323 const REALTYPE* partialsParent = gPartials[parentBufferIndices[subsetIndex]];
1324 const REALTYPE* transMatrix = gTransitionMatrices[probabilityIndices[subsetIndex]];
1325 const REALTYPE* wt = gCategoryWeights[categoryWeightsIndices[subsetIndex]];
1326 const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndices[subsetIndex]];
1327 int childIndex = childBufferIndices[subsetIndex];
1329 memset(integrationTmp, 0, (kPatternCount * kStateCount)*
sizeof(REALTYPE));
// Fast path: compact tip child -- single matrix-column lookup.
1331 if (childIndex < kTipCount && gTipStates[childIndex]) {
1333 const int* statesChild = gTipStates[childIndex];
1336 for(
int l = 0; l < kCategoryCount; l++) {
1338 const REALTYPE weight = wt[l];
1339 for(
int k = 0; k < kPatternCount; k++) {
1341 const int stateChild = statesChild[k];
1343 int w = l * kMatrixSize;
1344 for(
int i = 0; i < kStateCount; i++) {
1345 integrationTmp[u] += transMatrix[w + stateChild] * partialsParent[v + i] * weight;
1348 w += kTransPaddedStateCount;
1350 v += kPartialsPaddedStateCount;
// General path: full child partials, 4-way unrolled dot product.
1354 const REALTYPE* partialsChild = gPartials[childIndex];
1356 int stateCountModFour = (kStateCount / 4) * 4;
1358 for(
int l = 0; l < kCategoryCount; l++) {
1360 const REALTYPE weight = wt[l];
1361 for(
int k = 0; k < kPatternCount; k++) {
1362 int w = l * kMatrixSize;
1363 const REALTYPE* partialsChildPtr = &partialsChild[v];
1364 for(
int i = 0; i < kStateCount; i++) {
1365 double sumOverJA = 0.0, sumOverJB = 0.0;
1367 const REALTYPE* transMatrixPtr = &transMatrix[w];
1368 for (; j < stateCountModFour; j += 4) {
1369 sumOverJA += transMatrixPtr[j + 0] * partialsChildPtr[j + 0];
1370 sumOverJB += transMatrixPtr[j + 1] * partialsChildPtr[j + 1];
1371 sumOverJA += transMatrixPtr[j + 2] * partialsChildPtr[j + 2];
1372 sumOverJB += transMatrixPtr[j + 3] * partialsChildPtr[j + 3];
1375 for (; j < kStateCount; j++) {
1376 sumOverJA += transMatrixPtr[j] * partialsChildPtr[j];
1382 integrationTmp[u] += (sumOverJA + sumOverJB) * partialsParent[v + i] * weight;
1390 v += kPartialsPaddedStateCount;
// Integrate over states and fold this subset's per-pattern likelihood
// into the running per-pattern total.
1396 for(
int k = 0; k < kPatternCount; k++) {
1397 REALTYPE sumOverI = 0.0;
1398 for(
int i = 0; i < kStateCount; i++) {
1399 sumOverI += freqs[i] * integrationTmp[u];
1403 if (scalingFactorsIndices[0] != BEAGLE_OP_NONE) {
1404 int cumulativeScalingFactorIndex;
1405 cumulativeScalingFactorIndex = scalingFactorsIndices[subsetIndex];
1407 const REALTYPE* cumulativeScaleFactors = gScaleBuffers[cumulativeScalingFactorIndex];
// On the first subset, scan all subsets' scale factors once to find the
// per-pattern maximum; later subsets reuse the cached result.
1409 if (subsetIndex == 0) {
1410 indexMaxScale[k] = 0;
1411 maxScaleFactor[k] = cumulativeScaleFactors[k];
1412 for (
int j = 1; j < count; j++) {
1413 REALTYPE tmpScaleFactor;
1414 tmpScaleFactor = gScaleBuffers[scalingFactorsIndices[j]][k];
1416 if (tmpScaleFactor > maxScaleFactor[k]) {
1417 indexMaxScale[k] = j;
1418 maxScaleFactor[k] = tmpScaleFactor;
// Re-express this subset's likelihood in the max-scale frame so linear
// addition across subsets is valid (scale factors are log-space).
1423 if (subsetIndex != indexMaxScale[k])
1424 sumOverI *= exp((REALTYPE)(cumulativeScaleFactors[k] - maxScaleFactor[k]));
1429 if (subsetIndex == 0) {
1430 outLogLikelihoodsTmp[k] = sumOverI;
1431 }
else if (subsetIndex == count - 1) {
// Last subset: finish the linear sum and move to log space.
1432 REALTYPE tmpSum = outLogLikelihoodsTmp[k] + sumOverI;
1434 outLogLikelihoodsTmp[k] = log(tmpSum);
1436 outLogLikelihoodsTmp[k] += sumOverI;
// Add back the per-pattern max scale factor (log space).
1443 if (scalingFactorsIndices[0] != BEAGLE_OP_NONE) {
1444 for(
int i=0; i<kPatternCount; i++)
1445 outLogLikelihoodsTmp[i] += maxScaleFactor[i];
1449 *outSumLogLikelihood = 0.0;
1450 for (
int i = 0; i < kPatternCount; i++) {
1451 *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
// Non-finite check: x - x == 0 fails only for NaN/Inf.
1454 if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
1455 returnCode = BEAGLE_ERROR_FLOATING_POINT;
// Edge log-likelihoods plus the first derivative of the log-likelihood with
// respect to the edge's parameter, using a differentiated transition matrix
// (firstDerivativeIndex) alongside the ordinary one. Per pattern:
//   logL = log( sum_i freqs[i] * integrationTmp[i] )
//   dlogL = (sum_i freqs[i] * firstDerivTmp[i]) / (sum_i freqs[i] * integrationTmp[i])
// Both per-pattern series are accumulated with gPatternWeights into the
// two output sums.
//
// Returns BEAGLE_SUCCESS, or BEAGLE_ERROR_FLOATING_POINT when the summed
// log-likelihood is not finite.
//
// NOTE(review): offsets u/v and some closing braces are elided in this
// rendering; comments describe only the visible logic.
1462 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoodsFirstDeriv(
const int parIndex,
1463 const int childIndex,
1464 const int probIndex,
1465 const int firstDerivativeIndex,
1466 const int categoryWeightsIndex,
1467 const int stateFrequenciesIndex,
1468 const int scalingFactorsIndex,
1469 double* outSumLogLikelihood,
1470 double* outSumFirstDerivative) {
1472 assert(parIndex >= kTipCount);
1474 int returnCode = BEAGLE_SUCCESS;
1476 const REALTYPE* partialsParent = gPartials[parIndex];
1477 const REALTYPE* transMatrix = gTransitionMatrices[probIndex];
1478 const REALTYPE* firstDerivMatrix = gTransitionMatrices[firstDerivativeIndex];
1479 const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];
1480 const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndex];
// Two accumulators: likelihood terms and first-derivative terms.
1483 memset(integrationTmp, 0, (kPatternCount * kStateCount)*
sizeof(REALTYPE));
1484 memset(firstDerivTmp, 0, (kPatternCount * kStateCount)*
sizeof(REALTYPE));
// Fast path: compact tip child -- single matrix-column lookup per matrix.
1486 if (childIndex < kTipCount && gTipStates[childIndex]) {
1488 const int* statesChild = gTipStates[childIndex];
1491 for(
int l = 0; l < kCategoryCount; l++) {
1493 const REALTYPE weight = wt[l];
1494 for(
int k = 0; k < kPatternCount; k++) {
1496 const int stateChild = statesChild[k];
1498 int w = l * kMatrixSize;
1499 for(
int i = 0; i < kStateCount; i++) {
1500 integrationTmp[u] += transMatrix[w + stateChild] * partialsParent[v + i] * weight;
1501 firstDerivTmp[u] += firstDerivMatrix[w + stateChild] * partialsParent[v + i] * weight;
1504 w += kTransPaddedStateCount;
1506 v += kPartialsPaddedStateCount;
// General path: full child partials (no unrolling here, unlike
// calcEdgeLogLikelihoods -- two matrices are walked together).
1512 const REALTYPE* partialsChild = gPartials[childIndex];
1515 for(
int l = 0; l < kCategoryCount; l++) {
1517 const REALTYPE weight = wt[l];
1518 for(
int k = 0; k < kPatternCount; k++) {
1519 int w = l * kMatrixSize;
1520 for(
int i = 0; i < kStateCount; i++) {
1521 double sumOverJ = 0.0;
1522 double sumOverJD1 = 0.0;
1523 for(
int j = 0; j < kStateCount; j++) {
1524 sumOverJ += transMatrix[w] * partialsChild[v + j];
1525 sumOverJD1 += firstDerivMatrix[w] * partialsChild[v + j];
1532 integrationTmp[u] += sumOverJ * partialsParent[v + i] * weight;
1533 firstDerivTmp[u] += sumOverJD1 * partialsParent[v + i] * weight;
1536 v += kPartialsPaddedStateCount;
// Integrate over states; derivative of log L is D1/L per pattern.
1542 for(
int k = 0; k < kPatternCount; k++) {
1543 REALTYPE sumOverI = 0.0;
1544 REALTYPE sumOverID1 = 0.0;
1545 for(
int i = 0; i < kStateCount; i++) {
1546 sumOverI += freqs[i] * integrationTmp[u];
1547 sumOverID1 += freqs[i] * firstDerivTmp[u];
1551 outLogLikelihoodsTmp[k] = log(sumOverI);
1552 outFirstDerivativesTmp[k] = sumOverID1 / sumOverI;
// Scale factors (log space) affect only logL, not the D1/L ratio.
1556 if (scalingFactorsIndex != BEAGLE_OP_NONE) {
1557 const REALTYPE* scalingFactors = gScaleBuffers[scalingFactorsIndex];
1558 for(
int k=0; k < kPatternCount; k++)
1559 outLogLikelihoodsTmp[k] += scalingFactors[k];
1562 *outSumLogLikelihood = 0.0;
1563 *outSumFirstDerivative = 0.0;
1564 for (
int i = 0; i < kPatternCount; i++) {
1565 *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
1567 *outSumFirstDerivative += outFirstDerivativesTmp[i] * gPatternWeights[i];
// Non-finite check: x - x == 0 fails only for NaN/Inf.
1570 if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
1571 returnCode = BEAGLE_ERROR_FLOATING_POINT;
// Edge log-likelihoods plus first AND second derivatives of the
// log-likelihood, using differentiated transition matrices
// (firstDerivativeIndex, secondDerivativeIndex). Per pattern:
//   d1 = D1/L
//   d2 = D2/L - (D1/L)^2   (chain rule for d^2/dt^2 log L)
// All three per-pattern series are accumulated with gPatternWeights.
//
// Returns BEAGLE_SUCCESS, or BEAGLE_ERROR_FLOATING_POINT when the summed
// log-likelihood is not finite.
//
// NOTE(review): offsets u/v and some closing braces are elided in this
// rendering; comments describe only the visible logic.
1577 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoodsSecondDeriv(
const int parIndex,
1578 const int childIndex,
1579 const int probIndex,
1580 const int firstDerivativeIndex,
1581 const int secondDerivativeIndex,
1582 const int categoryWeightsIndex,
1583 const int stateFrequenciesIndex,
1584 const int scalingFactorsIndex,
1585 double* outSumLogLikelihood,
1586 double* outSumFirstDerivative,
1587 double* outSumSecondDerivative) {
1589 assert(parIndex >= kTipCount);
1591 int returnCode = BEAGLE_SUCCESS;
1593 const REALTYPE* partialsParent = gPartials[parIndex];
1594 const REALTYPE* transMatrix = gTransitionMatrices[probIndex];
1595 const REALTYPE* firstDerivMatrix = gTransitionMatrices[firstDerivativeIndex];
1596 const REALTYPE* secondDerivMatrix = gTransitionMatrices[secondDerivativeIndex];
1597 const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];
1598 const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndex];
// Three accumulators: likelihood, first- and second-derivative terms.
1601 memset(integrationTmp, 0, (kPatternCount * kStateCount)*
sizeof(REALTYPE));
1602 memset(firstDerivTmp, 0, (kPatternCount * kStateCount)*
sizeof(REALTYPE));
1603 memset(secondDerivTmp, 0, (kPatternCount * kStateCount)*
sizeof(REALTYPE));
// Fast path: compact tip child -- single matrix-column lookup per matrix.
1605 if (childIndex < kTipCount && gTipStates[childIndex]) {
1607 const int* statesChild = gTipStates[childIndex];
1610 for(
int l = 0; l < kCategoryCount; l++) {
1612 const REALTYPE weight = wt[l];
1613 for(
int k = 0; k < kPatternCount; k++) {
1615 const int stateChild = statesChild[k];
1617 int w = l * kMatrixSize;
1618 for(
int i = 0; i < kStateCount; i++) {
1619 integrationTmp[u] += transMatrix[w + stateChild] * partialsParent[v + i] * weight;
1620 firstDerivTmp[u] += firstDerivMatrix[w + stateChild] * partialsParent[v + i] * weight;
1621 secondDerivTmp[u] += secondDerivMatrix[w + stateChild] * partialsParent[v + i] * weight;
1624 w += kTransPaddedStateCount;
1626 v += kPartialsPaddedStateCount;
// General path: full child partials; three matrices walked together.
1632 const REALTYPE* partialsChild = gPartials[childIndex];
1635 for(
int l = 0; l < kCategoryCount; l++) {
1637 const REALTYPE weight = wt[l];
1638 for(
int k = 0; k < kPatternCount; k++) {
1639 int w = l * kMatrixSize;
1640 for(
int i = 0; i < kStateCount; i++) {
1641 double sumOverJ = 0.0;
1642 double sumOverJD1 = 0.0;
1643 double sumOverJD2 = 0.0;
1644 for(
int j = 0; j < kStateCount; j++) {
1645 sumOverJ += transMatrix[w] * partialsChild[v + j];
1646 sumOverJD1 += firstDerivMatrix[w] * partialsChild[v + j];
1647 sumOverJD2 += secondDerivMatrix[w] * partialsChild[v + j];
1654 integrationTmp[u] += sumOverJ * partialsParent[v + i] * weight;
1655 firstDerivTmp[u] += sumOverJD1 * partialsParent[v + i] * weight;
1656 secondDerivTmp[u] += sumOverJD2 * partialsParent[v + i] * weight;
1659 v += kPartialsPaddedStateCount;
// Integrate over states; then d1 = D1/L, d2 = D2/L - d1^2.
1665 for(
int k = 0; k < kPatternCount; k++) {
1666 REALTYPE sumOverI = 0.0;
1667 REALTYPE sumOverID1 = 0.0;
1668 REALTYPE sumOverID2 = 0.0;
1669 for(
int i = 0; i < kStateCount; i++) {
1670 sumOverI += freqs[i] * integrationTmp[u];
1671 sumOverID1 += freqs[i] * firstDerivTmp[u];
1672 sumOverID2 += freqs[i] * secondDerivTmp[u];
1676 outLogLikelihoodsTmp[k] = log(sumOverI);
1677 outFirstDerivativesTmp[k] = sumOverID1 / sumOverI;
1678 outSecondDerivativesTmp[k] = sumOverID2 / sumOverI - outFirstDerivativesTmp[k] * outFirstDerivativesTmp[k];
// Scale factors (log space) affect only logL; the derivative ratios cancel.
1682 if (scalingFactorsIndex != BEAGLE_OP_NONE) {
1683 const REALTYPE* scalingFactors = gScaleBuffers[scalingFactorsIndex];
1684 for(
int k=0; k < kPatternCount; k++)
1685 outLogLikelihoodsTmp[k] += scalingFactors[k];
1688 *outSumLogLikelihood = 0.0;
1689 *outSumFirstDerivative = 0.0;
1690 *outSumSecondDerivative = 0.0;
1691 for (
int i = 0; i < kPatternCount; i++) {
1692 *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
1694 *outSumFirstDerivative += outFirstDerivativesTmp[i] * gPatternWeights[i];
1696 *outSumSecondDerivative += outSecondDerivativesTmp[i] * gPatternWeights[i];
// Non-finite check: x - x == 0 fails only for NaN/Inf.
1699 if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
1700 returnCode = BEAGLE_ERROR_FLOATING_POINT;
// Blocks until all pending work is finished. The CPU implementation is
// synchronous (BEAGLE_FLAG_COMPUTATION_SYNCH), so there is never anything
// to wait for and this is a no-op.
1708 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::block(
void) {
1710 return BEAGLE_SUCCESS;
// Rescales a partials buffer in place to avoid floating-point underflow:
// for each pattern, every entry (across all categories and states) is
// divided by the pattern's maximum value, and that maximum is recorded in
// scaleFactors[k] -- as log(max) when BEAGLE_FLAG_SCALERS_LOG is set,
// otherwise as the raw max. cumulativeScaleFactors (if non-NULL) always
// accumulates log(max).
//
// NOTE(review): the declaration/initialization of `max` and the handling of
// the `fillWithOnes` parameter are elided in this rendering -- confirm
// against the full source.
1720 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::rescalePartials(REALTYPE* destP,
1721 REALTYPE* scaleFactors,
1722 REALTYPE* cumulativeScaleFactors,
1723 const int fillWithOnes) {
1724 if (DEBUGGING_OUTPUT) {
1725 std::cerr <<
"destP (before rescale): \n";
1726 for(
int i=0; i<kPartialsSize; i++)
1727 fprintf(stderr,
"destP[%d] = %.5f\n",i,destP[i]);
// Pass 1 (per pattern): find the maximum partial over all categories/states.
1731 for (
int k = 0; k < kPatternCount; k++) {
1733 const int patternOffset = k * kPartialsPaddedStateCount;
1734 for (
int l = 0; l < kCategoryCount; l++) {
1735 int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
1736 for (
int i = 0; i < kStateCount; i++) {
1737 if(destP[offset] > max)
1738 max = destP[offset];
// Pass 2: multiply by the reciprocal once rather than dividing per entry.
1746 REALTYPE oneOverMax = REALTYPE(1.0) / max;
1747 for (
int l = 0; l < kCategoryCount; l++) {
1748 int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
1749 for (
int i = 0; i < kStateCount; i++)
1750 destP[offset++] *= oneOverMax;
// Record the factor: log-space or raw depending on the SCALERS_LOG flag;
// the cumulative buffer is always log-space.
1753 if (kFlags & BEAGLE_FLAG_SCALERS_LOG) {
1754 REALTYPE logMax = log(max);
1755 scaleFactors[k] = logMax;
1756 if( cumulativeScaleFactors != NULL )
1757 cumulativeScaleFactors[k] += logMax;
1759 scaleFactors[k] = max;
1760 if( cumulativeScaleFactors != NULL )
1761 cumulativeScaleFactors[k] += log(max);
1764 if (DEBUGGING_OUTPUT) {
1765 for(
int i=0; i<kPatternCount; i++)
1766 fprintf(stderr,
"new scaleFactor[%d] = %.5f\n",i,scaleFactors[i]);
// Auto-scaling variant of rescalePartials: instead of dividing by the raw
// maximum, it extracts the binary exponent of the per-pattern maximum with
// frexp() and rescales by the exact power of two 2^-expMax, storing the
// (signed short) exponent in scaleFactors[k]. Power-of-two scaling changes
// only the exponent bits, so no mantissa precision is lost.
//
// NOTE(review): the declarations/initialization of `max` and `expMax` are
// elided in this rendering -- confirm against the full source.
1771 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::autoRescalePartials(REALTYPE* destP,
1772 signed short* scaleFactors) {
// Pass 1 (per pattern): maximum partial over all categories/states.
1775 for (
int k = 0; k < kPatternCount; k++) {
1777 const int patternOffset = k * kPartialsPaddedStateCount;
1778 for (
int l = 0; l < kCategoryCount; l++) {
1779 int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
1780 for (
int i = 0; i < kStateCount; i++) {
1781 if(destP[offset] > max)
1782 max = destP[offset];
// frexp stores the binary exponent of max into expMax.
1788 frexp(max, &expMax);
1789 scaleFactors[k] = expMax;
// Pass 2: rescale every entry by 2^-expMax.
1792 for (
int l = 0; l < kCategoryCount; l++) {
1793 int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
1794 for (
int i = 0; i < kStateCount; i++)
1795 destP[offset++] *= pow(2.0, -expMax);
// Peeling kernel for two compact-tip children: since each child contributes
// a single observed state per pattern, the new partial is just the product
// of the two corresponding transition-matrix entries -- no inner sum over
// child states is needed. Parallelized per rate category with OpenMP.
//
// NOTE(review): the states1/states2 parameter lines and the increments of
// v/w inside the loops are elided in this rendering.
1805 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesStates(REALTYPE* destP,
1807 const REALTYPE* matrices1,
1809 const REALTYPE* matrices2) {
1811 #pragma omp parallel for num_threads(kCategoryCount)
1812 for (
int l = 0; l < kCategoryCount; l++) {
// v indexes this category's slab of the destination partials.
1813 int v = l*kPartialsPaddedStateCount*kPatternCount;
1814 for (
int k = 0; k < kPatternCount; k++) {
1815 const int state1 = states1[k];
1816 const int state2 = states2[k];
1817 if (DEBUGGING_OUTPUT) {
1818 std::cerr <<
"calcStatesStates s1 = " << state1 <<
'\n';
1819 std::cerr <<
"calcStatesStates s2 = " << state2 <<
'\n';
1821 int w = l * kMatrixSize;
1822 for (
int i = 0; i < kStateCount; i++) {
// Row i of each matrix, column fixed by the observed tip state.
1823 destP[v] = matrices1[w + state1] * matrices2[w + state2];
1826 w += kTransPaddedStateCount;
// Same kernel as calcStatesStates, but each pattern's result is divided by
// a previously-computed per-pattern scale factor (fixed/manual scaling).
//
// NOTE(review): the increments of v/w inside the loops are elided in this
// rendering.
1834 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesStatesFixedScaling(REALTYPE* destP,
1835 const int* child1States,
1836 const REALTYPE* child1TransMat,
1837 const int* child2States,
1838 const REALTYPE* child2TransMat,
1839 const REALTYPE* scaleFactors) {
1840 #pragma omp parallel for num_threads(kCategoryCount)
1841 for (
int l = 0; l < kCategoryCount; l++) {
1842 int v = l*kPartialsPaddedStateCount*kPatternCount;
1843 for (
int k = 0; k < kPatternCount; k++) {
1844 const int state1 = child1States[k];
1845 const int state2 = child2States[k];
1846 int w = l * kMatrixSize;
// Same scale factor applies to every state/category of pattern k.
1847 REALTYPE scaleFactor = scaleFactors[k];
1848 for (
int i = 0; i < kStateCount; i++) {
1849 destP[v] = child1TransMat[w + state1] *
1850 child2TransMat[w + state2] / scaleFactor;
1853 w += kTransPaddedStateCount;
// Peeling kernel for one compact-tip child (states1/matrices1) and one
// partials child (partials2/matrices2). The tip side reduces to a single
// matrix lookup; the partials side is a matrix-vector product computed with
// a 4-way unrolled dot product (two accumulators to break the FP dependency
// chain). Parallelized per rate category with OpenMP.
//
// NOTE(review): the states1 parameter line, the guard around `matrixIncr +=
// T_PAD`, and the declaration of j / increment of v are elided in this
// rendering.
1864 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesPartials(REALTYPE* destP,
1866 const REALTYPE* matrices1,
1867 const REALTYPE* partials2,
1868 const REALTYPE* matrices2) {
// Row stride of the (possibly padded) transition matrices.
1869 int matrixIncr = kStateCount;
1872 matrixIncr += T_PAD;
1875 int stateCountModFour = (kStateCount / 4) * 4;
1877 #pragma omp parallel for num_threads(kCategoryCount)
1878 for (
int l = 0; l < kCategoryCount; l++) {
1879 int v = l*kPartialsPaddedStateCount*kPatternCount;
1880 int matrixOffset = l*kMatrixSize;
1881 const REALTYPE* partials2Ptr = &partials2[v];
1882 REALTYPE* destPtr = &destP[v];
1883 for (
int k = 0; k < kPatternCount; k++) {
1884 int w = l * kMatrixSize;
1885 int state1 = states1[k];
1886 for (
int i = 0; i < kStateCount; i++) {
1887 const REALTYPE* matrices2Ptr = matrices2 + matrixOffset + i * matrixIncr;
// Tip side: single entry from row i, column state1.
1888 REALTYPE tmp = matrices1[w + state1];
1889 REALTYPE sumA = 0.0;
1890 REALTYPE sumB = 0.0;
1892 for (; j < stateCountModFour; j += 4) {
1893 sumA += matrices2Ptr[j + 0] * partials2Ptr[j + 0];
1894 sumB += matrices2Ptr[j + 1] * partials2Ptr[j + 1];
1895 sumA += matrices2Ptr[j + 2] * partials2Ptr[j + 2];
1896 sumB += matrices2Ptr[j + 3] * partials2Ptr[j + 3];
// Remainder loop for state counts not divisible by four.
1898 for (; j < kStateCount; j++) {
1899 sumA += matrices2Ptr[j] * partials2Ptr[j];
1904 *(destPtr++) = tmp * (sumA + sumB);
1907 partials2Ptr += kPartialsPaddedStateCount;
// Same kernel as calcStatesPartials, with each pattern's result multiplied
// by the reciprocal of a previously-computed per-pattern scale factor
// (fixed/manual scaling; reciprocal computed once per pattern).
//
// NOTE(review): the states1 parameter line, the guard around `matrixIncr +=
// T_PAD`, and the declaration of j / increment of v are elided in this
// rendering.
1913 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesPartialsFixedScaling(REALTYPE* destP,
1915 const REALTYPE* matrices1,
1916 const REALTYPE* partials2,
1917 const REALTYPE* matrices2,
1918 const REALTYPE* scaleFactors) {
1919 int matrixIncr = kStateCount;
1922 matrixIncr += T_PAD;
1924 int stateCountModFour = (kStateCount / 4) * 4;
1926 #pragma omp parallel for num_threads(kCategoryCount)
1927 for (
int l = 0; l < kCategoryCount; l++) {
1928 int v = l*kPartialsPaddedStateCount*kPatternCount;
1929 int matrixOffset = l*kMatrixSize;
1930 const REALTYPE* partials2Ptr = &partials2[v];
1931 REALTYPE* destPtr = &destP[v];
1932 for (
int k = 0; k < kPatternCount; k++) {
1933 int w = l * kMatrixSize;
1934 int state1 = states1[k];
// Multiply by the reciprocal once per entry instead of dividing.
1935 REALTYPE oneOverScaleFactor = REALTYPE(1.0) / scaleFactors[k];
1936 for (
int i = 0; i < kStateCount; i++) {
1937 const REALTYPE* matrices2Ptr = matrices2 + matrixOffset + i * matrixIncr;
1938 REALTYPE tmp = matrices1[w + state1];
1939 REALTYPE sumA = 0.0;
1940 REALTYPE sumB = 0.0;
1942 for (; j < stateCountModFour; j += 4) {
1943 sumA += matrices2Ptr[j + 0] * partials2Ptr[j + 0];
1944 sumB += matrices2Ptr[j + 1] * partials2Ptr[j + 1];
1945 sumA += matrices2Ptr[j + 2] * partials2Ptr[j + 2];
1946 sumB += matrices2Ptr[j + 3] * partials2Ptr[j + 3];
1948 for (; j < kStateCount; j++) {
1949 sumA += matrices2Ptr[j] * partials2Ptr[j];
1954 *(destPtr++) = tmp * (sumA + sumB) * oneOverScaleFactor;
1957 partials2Ptr += kPartialsPaddedStateCount;
// Core peeling kernel for two partials children: for each (category,
// pattern, state) the new partial is the product of two matrix-vector dot
// products, one per child. Both dot products are computed together in a
// 4-way unrolled loop with dual accumulators (A/B) per child to break the
// floating-point dependency chains. Parallelized per rate category.
//
// NOTE(review): the guard around `matrixIncr += T_PAD` and the declaration
// of j / increment of v are elided in this rendering.
1966 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcPartialsPartials(REALTYPE* destP,
1967 const REALTYPE* partials1,
1968 const REALTYPE* matrices1,
1969 const REALTYPE* partials2,
1970 const REALTYPE* matrices2) {
// Row stride of the (possibly padded) transition matrices.
1971 int matrixIncr = kStateCount;
1974 matrixIncr += T_PAD;
1976 int stateCountModFour = (kStateCount / 4) * 4;
1978 #pragma omp parallel for num_threads(kCategoryCount)
1979 for (
int l = 0; l < kCategoryCount; l++) {
1980 int v = l*kPartialsPaddedStateCount*kPatternCount;
1981 int matrixOffset = l*kMatrixSize;
1982 const REALTYPE* partials1Ptr = &partials1[v];
1983 const REALTYPE* partials2Ptr = &partials2[v];
1984 REALTYPE* destPtr = &destP[v];
1985 for (
int k = 0; k < kPatternCount; k++) {
1987 for (
int i = 0; i < kStateCount; i++) {
1988 const REALTYPE* matrices1Ptr = matrices1 + matrixOffset + i * matrixIncr;
1989 const REALTYPE* matrices2Ptr = matrices2 + matrixOffset + i * matrixIncr;
1990 REALTYPE sum1A = 0.0, sum2A = 0.0;
1991 REALTYPE sum1B = 0.0, sum2B = 0.0;
1993 for (; j < stateCountModFour; j += 4) {
1994 sum1A += matrices1Ptr[j + 0] * partials1Ptr[j + 0];
1995 sum2A += matrices2Ptr[j + 0] * partials2Ptr[j + 0];
1997 sum1B += matrices1Ptr[j + 1] * partials1Ptr[j + 1];
1998 sum2B += matrices2Ptr[j + 1] * partials2Ptr[j + 1];
2000 sum1A += matrices1Ptr[j + 2] * partials1Ptr[j + 2];
2001 sum2A += matrices2Ptr[j + 2] * partials2Ptr[j + 2];
2003 sum1B += matrices1Ptr[j + 3] * partials1Ptr[j + 3];
2004 sum2B += matrices2Ptr[j + 3] * partials2Ptr[j + 3];
// Remainder loop for state counts not divisible by four.
2007 for (; j < kStateCount; j++) {
2008 sum1A += matrices1Ptr[j] * partials1Ptr[j];
2009 sum2A += matrices2Ptr[j] * partials2Ptr[j];
2012 *(destPtr++) = (sum1A + sum1B) * (sum2A + sum2B);
2015 partials1Ptr += kPartialsPaddedStateCount;
2016 partials2Ptr += kPartialsPaddedStateCount;
// Same kernel as calcPartialsPartials, with each pattern's result
// multiplied by the reciprocal of a previously-computed per-pattern scale
// factor (fixed/manual scaling).
//
// NOTE(review): the guard around `matrixIncr += T_PAD` and the declaration
// of j / increment of v are elided in this rendering.
2022 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcPartialsPartialsFixedScaling(REALTYPE* destP,
2023 const REALTYPE* partials1,
2024 const REALTYPE* matrices1,
2025 const REALTYPE* partials2,
2026 const REALTYPE* matrices2,
2027 const REALTYPE* scaleFactors) {
2028 int matrixIncr = kStateCount;
2031 matrixIncr += T_PAD;
2033 int stateCountModFour = (kStateCount / 4) * 4;
2035 #pragma omp parallel for num_threads(kCategoryCount)
2036 for (
int l = 0; l < kCategoryCount; l++) {
2037 int v = l*kPartialsPaddedStateCount*kPatternCount;
2038 int matrixOffset = l*kMatrixSize;
2039 const REALTYPE* partials1Ptr = &partials1[v];
2040 const REALTYPE* partials2Ptr = &partials2[v];
2041 REALTYPE* destPtr = &destP[v];
2042 for (
int k = 0; k < kPatternCount; k++) {
// Multiply by the reciprocal once per entry instead of dividing.
2043 REALTYPE oneOverScaleFactor = REALTYPE(1.0) / scaleFactors[k];
2044 for (
int i = 0; i < kStateCount; i++) {
2045 const REALTYPE* matrices1Ptr = matrices1 + matrixOffset + i * matrixIncr;
2046 const REALTYPE* matrices2Ptr = matrices2 + matrixOffset + i * matrixIncr;
2047 REALTYPE sum1A = 0.0, sum2A = 0.0;
2048 REALTYPE sum1B = 0.0, sum2B = 0.0;
2050 for (; j < stateCountModFour; j += 4) {
2051 sum1A += matrices1Ptr[j + 0] * partials1Ptr[j + 0];
2052 sum2A += matrices2Ptr[j + 0] * partials2Ptr[j + 0];
2054 sum1B += matrices1Ptr[j + 1] * partials1Ptr[j + 1];
2055 sum2B += matrices2Ptr[j + 1] * partials2Ptr[j + 1];
2057 sum1A += matrices1Ptr[j + 2] * partials1Ptr[j + 2];
2058 sum2A += matrices2Ptr[j + 2] * partials2Ptr[j + 2];
2060 sum1B += matrices1Ptr[j + 3] * partials1Ptr[j + 3];
2061 sum2B += matrices2Ptr[j + 3] * partials2Ptr[j + 3];
2064 for (; j < kStateCount; j++) {
2065 sum1A += matrices1Ptr[j] * partials1Ptr[j];
2066 sum2A += matrices2Ptr[j] * partials2Ptr[j];
2069 *(destPtr++) = (sum1A + sum1B) * (sum2A + sum2B) * oneOverScaleFactor;
2072 partials1Ptr += kPartialsPaddedStateCount;
2073 partials2Ptr += kPartialsPaddedStateCount;
// Auto-scaling variant of calcPartialsPartials: computes the same
// two-child product (simple, non-unrolled inner loop) and, while scaling is
// still inactive, inspects each result's binary exponent via frexp; when
// the magnitude of any exponent exceeds scalingExponentThreshhold it sets
// *activateScaling = 1 to turn on rescaling for subsequent operations.
//
// NOTE(review): the increments of u/w and the declaration of expTmp are
// elided in this rendering.
2079 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcPartialsPartialsAutoScaling(REALTYPE* destP,
2080 const REALTYPE* partials1,
2081 const REALTYPE* matrices1,
2082 const REALTYPE* partials2,
2083 const REALTYPE* matrices2,
2084 int* activateScaling) {
2086 #pragma omp parallel for num_threads(kCategoryCount)
2087 for (
int l = 0; l < kCategoryCount; l++) {
// u: write offset into destP; v: read offset into the child partials.
2088 int u = l*kPartialsPaddedStateCount*kPatternCount;
2089 int v = l*kPartialsPaddedStateCount*kPatternCount;
2090 for (
int k = 0; k < kPatternCount; k++) {
2091 int w = l * kMatrixSize;
2092 for (
int i = 0; i < kStateCount; i++) {
2093 REALTYPE sum1 = 0.0, sum2 = 0.0;
2094 for (
int j = 0; j < kStateCount; j++) {
2095 sum1 += matrices1[w] * partials1[v + j];
2096 sum2 += matrices2[w] * partials2[v + j];
2103 destP[u] = sum1 * sum2;
// Only probe exponents until scaling has been activated once.
2105 if (*activateScaling == 0) {
2107 frexp(destP[u], &expTmp);
2108 if (abs(expTmp) > scalingExponentThreshhold)
2109 *activateScaling = 1;
2115 v += kPartialsPaddedStateCount;
// Returns the modulus to which the pattern count is padded for this
// implementation. NOTE(review): the function body is elided in this
// rendering of the file -- confirm the returned value against the full
// source before relying on it.
2121 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getPaddedPatternsModulus() {
// Allocates `size` bytes aligned for vector access. On Apple/Win32 a
// platform-specific path is taken first (elided here); otherwise
// posix_memalign is used with 32-byte alignment when the wider-alignment
// build branch is active, else 16 bytes.
//
// NOTE(review): the platform-specific allocation calls and the #if/#else
// structure around the `align` constants are partially elided in this
// rendering -- confirm against the full source.
2127 void* BeagleCPUImpl<BEAGLE_CPU_GENERIC>::mallocAligned(
size_t size) {
2128 void *ptr = (
void *) NULL;
2130 #if defined (__APPLE__) || defined(WIN32)
2138 if(ptr == (
void*)NULL) {
2143 const size_t align = 32;
2145 const size_t align = 16;
2148 res = posix_memalign(&ptr, align, size);
// Factory entry point: heap-allocates a BeagleCPUImpl with the default
// padding parameters, initializes it via createInstance with the requested
// buffer/configuration sizes, and returns it on success.
//
// NOTE(review): several parameter lines (stateCount, patternCount,
// categoryCount, resourceNumber, errorCode) and the failure/cleanup path
// (delete + return NULL on error or exception) are elided in this
// rendering -- confirm against the full source.
2159 BEAGLE_CPU_FACTORY_TEMPLATE
2160 BeagleImpl* BeagleCPUImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::createImpl(
int tipCount,
2161 int partialsBufferCount,
2162 int compactBufferCount,
2165 int eigenBufferCount,
2166 int matrixBufferCount,
2168 int scaleBufferCount,
2170 long preferenceFlags,
2171 long requirementFlags,
2174 BeagleImpl* impl =
new BeagleCPUImpl<REALTYPE, T_PAD_DEFAULT, P_PAD_DEFAULT>();
2178 impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount,
2179 patternCount, eigenBufferCount, matrixBufferCount,
2180 categoryCount,scaleBufferCount, resourceNumber, preferenceFlags, requirementFlags);
2181 if (*errorCode == BEAGLE_SUCCESS) {
2188 if (DEBUGGING_OUTPUT)
2189 std::cerr <<
"exception in initialize\n";
// Returns the human-readable implementation name ("CPU-Double" /
// "CPU-Single" / "CPU-Unknown"), dispatched on the factory's precision
// type via the getBeagleCPUName<> specializations defined above.
2200 BEAGLE_CPU_FACTORY_TEMPLATE
2201 const char* BeagleCPUImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::getName() {
2202 return getBeagleCPUName<BEAGLE_CPU_FACTORY_GENERIC>();
// Advertises the full capability set of the plain CPU implementation:
// synchronous computation, every scaling mode, log/raw scalers, real and
// complex eigendecompositions, both inverse-eigenvector layouts, no
// vectorization -- plus the precision flag selected at compile time by
// DOUBLE_PRECISION.
//
// NOTE(review): the `return flags;` line is elided in this rendering.
2205 BEAGLE_CPU_FACTORY_TEMPLATE
2206 const long BeagleCPUImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::getFlags() {
2207 long flags = BEAGLE_FLAG_COMPUTATION_SYNCH |
2208 BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO | BEAGLE_FLAG_SCALING_DYNAMIC |
2209 BEAGLE_FLAG_THREADING_NONE |
2210 BEAGLE_FLAG_PROCESSOR_CPU |
2211 BEAGLE_FLAG_VECTOR_NONE |
2212 BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW |
2213 BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL |
2214 BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED;
2215 if (DOUBLE_PRECISION)
2216 flags |= BEAGLE_FLAG_PRECISION_DOUBLE;
2218 flags |= BEAGLE_FLAG_PRECISION_SINGLE;
2225 #endif // BEAGLE_CPU_IMPL_GENERAL_HPP