HMSBEAGLE  1.0.0
BeagleCPUImpl.hpp
/*
 *  BeagleCPUImpl.hpp
 *  BEAGLE
 *
 * Copyright 2009 Phylogenetic Likelihood Working Group
 *
 * This file is part of BEAGLE.
 *
 * BEAGLE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * BEAGLE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with BEAGLE. If not, see
 * <http://www.gnu.org/licenses/>.
 *
 * @author Andrew Rambaut
 * @author Marc Suchard
 * @author Daniel Ayres
 * @author Mark Holder
 */

// TODO: wrap partials, eigen-calcs, and transition matrices in small structs
// so that we can flag them. This would be helpful for
// implementing:
// 1. an error-checking version that double-checks (to the extent
// possible) that the client is using the API correctly. This would
// ideally be a conditional compilation variant (so that we do
// not normally incur runtime penalties, but can enable it to help
// find bugs).
// 2. a multithreading impl that checks dependencies before queuing
// partials.

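// For illustration only (not part of this file): one way item 1 could be
// realized as a conditional-compilation variant, assuming a hypothetical
// BEAGLE_API_CHECKING macro and an (also hypothetical) isValidBufferIndex()
// helper:
//
//     #ifdef BEAGLE_API_CHECKING
//     #define BEAGLE_CHECK_BUFFER(i) assert(isValidBufferIndex(i))
//     #else
//     #define BEAGLE_CHECK_BUFFER(i) ((void) 0)
//     #endif
//
// With checking disabled the macro expands to a no-op, so release builds
// incur no runtime penalty.
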
// API-ISSUE: adding a resizePartialsBufferArray(int newPartialsBufferCount) method
// would be trivial for this impl, and would be easier for clients that want
// to cache partials-like calculations for an indeterminate number of trees.
// API-ISSUE: adding a
// void waitForPartials(int* instance,
//                      int instanceCount,
//                      int* parentPartialIndex,
//                      int partialCount);
// method that blocks until the partials are valid would be important for
// clients (such as GARLI) that deal with big trees by overwriting some temporaries.
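// A sketch of how such a blocking call might be used (hypothetical, since the
// method is only proposed above): a client that reuses buffers would wait on a
// parent buffer before overwriting its children.
//
//     int instance = 0;
//     int parentPartialIndex = 7;
//     // ... queue updatePartials() operations that write buffer 7 ...
//     waitForPartials(&instance, 1, &parentPartialIndex, 1);
//     // buffer 7 is now valid; its children may safely be overwritten
//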
// API-ISSUE: swapping temporaries (we decided not to implement the following
// idea, but MTH did want to record it for posterity). We could add the
// following calls:
// BeagleReturnCodes swapEigens(int instance, int *firstInd, int *secondInd, int count);
// BeagleReturnCodes swapTransitionMatrices(int instance, int *firstInd, int *secondInd, int count);
// BeagleReturnCodes swapPartials(int instance, int *firstInd, int *secondInd, int count);
// They would be optional for the client but could improve efficiency if:
// 1. the impl is load balancing, AND
// 2. the client code uses the calls to synchronize the indexing of temporaries
// between instances such that you can pass an instanceIndices list with
// multiple entries to updatePartials.
// These seem too nitty-gritty and low-level, but they also make it easy to
// write a wrapper geared toward MCMC (during a move, cache the old data
// in an unused array; after a rejection, swap back to the cached copy).

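// A sketch of the MCMC wrapper idea above (hypothetical, since the swap calls
// were never implemented): assuming swapPartials() with the signature proposed
// earlier, a Metropolis-Hastings step could cache and restore state like so.
//
//     int active = nodePartialIndex, scratch = unusedPartialIndex;
//     swapPartials(instance, &active, &scratch, 1);  // cache old data
//     // ... propose a move, recompute partials into `active`, evaluate lnL ...
//     if (rejected)
//         swapPartials(instance, &active, &scratch, 1);  // restore cached copy
//
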
#ifndef BEAGLE_CPU_IMPL_GENERAL_HPP
#define BEAGLE_CPU_IMPL_GENERAL_HPP

#ifdef HAVE_CONFIG_H
#include "libhmsbeagle/config.h"
#endif

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <cstring>
#include <cmath>
#include <cassert>
#include <vector>
#include <cfloat>

#include "libhmsbeagle/beagle.h"
#include "libhmsbeagle/CPU/Precision.h"
#include "libhmsbeagle/CPU/BeagleCPUImpl.h"
#include "libhmsbeagle/CPU/EigenDecompositionCube.h"
#include "libhmsbeagle/CPU/EigenDecompositionSquare.h"

namespace beagle {
namespace cpu {

//#if defined (BEAGLE_IMPL_DEBUGGING_OUTPUT) && BEAGLE_IMPL_DEBUGGING_OUTPUT
//const bool DEBUGGING_OUTPUT = true;
//#else
//const bool DEBUGGING_OUTPUT = false;
//#endif

BEAGLE_CPU_FACTORY_TEMPLATE
inline const char* getBeagleCPUName(){ return "CPU-Unknown"; };

template<>
inline const char* getBeagleCPUName<double>(){ return "CPU-Double"; };

template<>
inline const char* getBeagleCPUName<float>(){ return "CPU-Single"; };

BEAGLE_CPU_FACTORY_TEMPLATE
inline const long getBeagleCPUFlags(){ return BEAGLE_FLAG_COMPUTATION_SYNCH; };

template<>
inline const long getBeagleCPUFlags<double>(){ return BEAGLE_FLAG_COMPUTATION_SYNCH |
                                                      BEAGLE_FLAG_THREADING_NONE |
                                                      BEAGLE_FLAG_PROCESSOR_CPU |
                                                      BEAGLE_FLAG_PRECISION_DOUBLE |
                                                      BEAGLE_FLAG_VECTOR_NONE; };

template<>
inline const long getBeagleCPUFlags<float>(){ return BEAGLE_FLAG_COMPUTATION_SYNCH |
                                                     BEAGLE_FLAG_THREADING_NONE |
                                                     BEAGLE_FLAG_PROCESSOR_CPU |
                                                     BEAGLE_FLAG_PRECISION_SINGLE |
                                                     BEAGLE_FLAG_VECTOR_NONE; };


BEAGLE_CPU_TEMPLATE
BeagleCPUImpl<BEAGLE_CPU_GENERIC>::~BeagleCPUImpl() {
    // free all that stuff...
    // If you delete partials, make sure not to delete the last element,
    // which is TEMP_SCRATCH_PARTIAL, twice.

    for (unsigned int i = 0; i < kEigenDecompCount; i++) {
        if (gCategoryWeights[i] != NULL)
            free(gCategoryWeights[i]);
        if (gStateFrequencies[i] != NULL)
            free(gStateFrequencies[i]);
    }

    for (unsigned int i = 0; i < kMatrixCount; i++) {
        if (gTransitionMatrices[i] != NULL)
            free(gTransitionMatrices[i]);
    }
    free(gTransitionMatrices);

    for (unsigned int i = 0; i < kBufferCount; i++) {
        if (gPartials[i] != NULL)
            free(gPartials[i]);
        if (gTipStates[i] != NULL)
            free(gTipStates[i]);
    }
    free(gPartials);
    free(gTipStates);

    if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
        for (unsigned int i = 0; i < kScaleBufferCount; i++) {
            if (gAutoScaleBuffers[i] != NULL)
                free(gAutoScaleBuffers[i]);
        }
        if (gAutoScaleBuffers)
            free(gAutoScaleBuffers);
        free(gActiveScalingFactors);
        if (gScaleBuffers[0] != NULL)
            free(gScaleBuffers[0]);
    } else {
        for (unsigned int i = 0; i < kScaleBufferCount; i++) {
            if (gScaleBuffers[i] != NULL)
                free(gScaleBuffers[i]);
        }
    }

    if (gScaleBuffers)
        free(gScaleBuffers);

    free(gCategoryRates);
    free(gPatternWeights);

    free(integrationTmp);
    free(firstDerivTmp);
    free(secondDerivTmp);

    free(outLogLikelihoodsTmp);
    free(outFirstDerivativesTmp);
    free(outSecondDerivativesTmp);

    free(ones);
    free(zeros);

    delete gEigenDecomposition;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::createInstance(int tipCount,
                                                      int partialsBufferCount,
                                                      int compactBufferCount,
                                                      int stateCount,
                                                      int patternCount,
                                                      int eigenDecompositionCount,
                                                      int matrixCount,
                                                      int categoryCount,
                                                      int scaleBufferCount,
                                                      int resourceNumber,
                                                      long preferenceFlags,
                                                      long requirementFlags) {
    if (DEBUGGING_OUTPUT)
        std::cerr << "in BeagleCPUImpl::createInstance\n";

    if (DOUBLE_PRECISION) {
        realtypeMin = DBL_MIN;
        scalingExponentThreshhold = 200;
    } else {
        realtypeMin = FLT_MIN;
        scalingExponentThreshhold = 20;
    }

    kBufferCount = partialsBufferCount + compactBufferCount;
    kTipCount = tipCount;
    assert(kBufferCount > kTipCount);
    kStateCount = stateCount;
    kPatternCount = patternCount;

    kInternalPartialsBufferCount = kBufferCount - kTipCount;

    kTransPaddedStateCount = kStateCount + T_PAD;
    kPartialsPaddedStateCount = kStateCount + P_PAD;

    // Handle possible padding of pattern sites for vectorization
    int modulus = getPaddedPatternsModulus();
    kPaddedPatternCount = kPatternCount;
    int remainder = kPatternCount % modulus;
    if (remainder != 0) {
        kPaddedPatternCount += modulus - remainder;
    }
    kExtraPatterns = kPaddedPatternCount - kPatternCount;
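    // Worked example (illustrative): with modulus = 4 and kPatternCount = 10,
    // remainder = 2, so kPaddedPatternCount = 12 and kExtraPatterns = 2; the
    // two extra pattern slots are dummy sites that keep vector loops full-width.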

    kMatrixCount = matrixCount;
    kEigenDecompCount = eigenDecompositionCount;
    kCategoryCount = categoryCount;
    kScaleBufferCount = scaleBufferCount;

    kMatrixSize = (T_PAD + kStateCount) * kStateCount;

    int scaleBufferSize = kPaddedPatternCount;

    kFlags = 0;

    if (preferenceFlags & BEAGLE_FLAG_SCALING_AUTO || requirementFlags & BEAGLE_FLAG_SCALING_AUTO) {
        kFlags |= BEAGLE_FLAG_SCALING_AUTO;
        kFlags |= BEAGLE_FLAG_SCALERS_LOG;
        kScaleBufferCount = kInternalPartialsBufferCount;
    } else if (preferenceFlags & BEAGLE_FLAG_SCALING_ALWAYS || requirementFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
        kFlags |= BEAGLE_FLAG_SCALING_ALWAYS;
        kFlags |= BEAGLE_FLAG_SCALERS_LOG;
        kScaleBufferCount = kInternalPartialsBufferCount + 1; // +1 for the temp buffer used by edge likelihoods
    } else if (preferenceFlags & BEAGLE_FLAG_SCALING_DYNAMIC || requirementFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
        kFlags |= BEAGLE_FLAG_SCALING_DYNAMIC;
        kFlags |= BEAGLE_FLAG_SCALERS_RAW;
    } else if (preferenceFlags & BEAGLE_FLAG_SCALERS_LOG || requirementFlags & BEAGLE_FLAG_SCALERS_LOG) {
        kFlags |= BEAGLE_FLAG_SCALING_MANUAL;
        kFlags |= BEAGLE_FLAG_SCALERS_LOG;
    } else {
        kFlags |= BEAGLE_FLAG_SCALING_MANUAL;
        kFlags |= BEAGLE_FLAG_SCALERS_RAW;
    }

    if (requirementFlags & BEAGLE_FLAG_EIGEN_COMPLEX || preferenceFlags & BEAGLE_FLAG_EIGEN_COMPLEX)
        kFlags |= BEAGLE_FLAG_EIGEN_COMPLEX;
    else
        kFlags |= BEAGLE_FLAG_EIGEN_REAL;

    if (requirementFlags & BEAGLE_FLAG_INVEVEC_TRANSPOSED || preferenceFlags & BEAGLE_FLAG_INVEVEC_TRANSPOSED)
        kFlags |= BEAGLE_FLAG_INVEVEC_TRANSPOSED;
    else
        kFlags |= BEAGLE_FLAG_INVEVEC_STANDARD;

    if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX)
        gEigenDecomposition = new EigenDecompositionSquare<BEAGLE_CPU_EIGEN_GENERIC>(kEigenDecompCount,
                                                                                     kStateCount, kCategoryCount, kFlags);
    else
        gEigenDecomposition = new EigenDecompositionCube<BEAGLE_CPU_EIGEN_GENERIC>(kEigenDecompCount,
                                                                                   kStateCount, kCategoryCount, kFlags);

    gCategoryRates = (double*) malloc(sizeof(double) * kCategoryCount);
    if (gCategoryRates == NULL)
        throw std::bad_alloc();

    gPatternWeights = (double*) malloc(sizeof(double) * kPatternCount);
    if (gPatternWeights == NULL)
        throw std::bad_alloc();

    // TODO: if pattern padding is implemented this will create problems with setTipPartials
    kPartialsSize = kPaddedPatternCount * kPartialsPaddedStateCount * kCategoryCount;

    gPartials = (REALTYPE**) malloc(sizeof(REALTYPE*) * kBufferCount);
    if (gPartials == NULL)
        throw std::bad_alloc();

    gStateFrequencies = (REALTYPE**) calloc(sizeof(REALTYPE*), kEigenDecompCount);
    if (gStateFrequencies == NULL)
        throw std::bad_alloc();

    gCategoryWeights = (REALTYPE**) calloc(sizeof(REALTYPE*), kEigenDecompCount);
    if (gCategoryWeights == NULL)
        throw std::bad_alloc();

    // allocating kBufferCount entries (and NULL-ing them all) so that we can just
    // check whether a tipStateBuffer is allocated
    gTipStates = (int**) malloc(sizeof(int*) * kBufferCount);
    if (gTipStates == NULL)
        throw std::bad_alloc();

    for (int i = 0; i < kBufferCount; i++) {
        gPartials[i] = NULL;
        gTipStates[i] = NULL;
    }

    for (int i = kTipCount; i < kBufferCount; i++) {
        gPartials[i] = (REALTYPE*) mallocAligned(sizeof(REALTYPE) * kPartialsSize);
        if (gPartials[i] == NULL)
            throw std::bad_alloc();
    }

    gScaleBuffers = NULL;

    gAutoScaleBuffers = NULL;

    if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
        gAutoScaleBuffers = (signed short**) malloc(sizeof(signed short*) * kScaleBufferCount);
        if (gAutoScaleBuffers == NULL)
            throw std::bad_alloc();
        for (int i = 0; i < kScaleBufferCount; i++) {
            gAutoScaleBuffers[i] = (signed short*) malloc(sizeof(signed short) * scaleBufferSize);
            if (gAutoScaleBuffers[i] == 0L)
                throw std::bad_alloc();
        }
        gActiveScalingFactors = (int*) malloc(sizeof(int) * kInternalPartialsBufferCount);
        gScaleBuffers = (REALTYPE**) malloc(sizeof(REALTYPE*));
        gScaleBuffers[0] = (REALTYPE*) malloc(sizeof(REALTYPE) * scaleBufferSize);
    } else {
        gScaleBuffers = (REALTYPE**) malloc(sizeof(REALTYPE*) * kScaleBufferCount);
        if (gScaleBuffers == NULL)
            throw std::bad_alloc();

        for (int i = 0; i < kScaleBufferCount; i++) {
            gScaleBuffers[i] = (REALTYPE*) malloc(sizeof(REALTYPE) * scaleBufferSize);

            if (gScaleBuffers[i] == 0L)
                throw std::bad_alloc();

            if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
                for (int j = 0; j < scaleBufferSize; j++) {
                    gScaleBuffers[i][j] = 1.0;
                }
            }
        }
    }

    gTransitionMatrices = (REALTYPE**) malloc(sizeof(REALTYPE*) * kMatrixCount);
    if (gTransitionMatrices == NULL)
        throw std::bad_alloc();
    for (int i = 0; i < kMatrixCount; i++) {
        gTransitionMatrices[i] = (REALTYPE*) mallocAligned(sizeof(REALTYPE) * kMatrixSize * kCategoryCount);
        if (gTransitionMatrices[i] == 0L)
            throw std::bad_alloc();
    }

    integrationTmp = (REALTYPE*) mallocAligned(sizeof(REALTYPE) * kPatternCount * kStateCount);
    firstDerivTmp = (REALTYPE*) malloc(sizeof(REALTYPE) * kPatternCount * kStateCount);
    secondDerivTmp = (REALTYPE*) malloc(sizeof(REALTYPE) * kPatternCount * kStateCount);

    outLogLikelihoodsTmp = (REALTYPE*) malloc(sizeof(REALTYPE) * kPatternCount * kStateCount);
    outFirstDerivativesTmp = (REALTYPE*) malloc(sizeof(REALTYPE) * kPatternCount * kStateCount);
    outSecondDerivativesTmp = (REALTYPE*) malloc(sizeof(REALTYPE) * kPatternCount * kStateCount);

    zeros = (REALTYPE*) malloc(sizeof(REALTYPE) * kPaddedPatternCount);
    ones = (REALTYPE*) malloc(sizeof(REALTYPE) * kPaddedPatternCount);
    for (int i = 0; i < kPaddedPatternCount; i++) {
        zeros[i] = 0.0;
        ones[i] = 1.0;
    }

    return BEAGLE_SUCCESS;
}

BEAGLE_CPU_TEMPLATE
const char* BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getName() {
    return getBeagleCPUName<BEAGLE_CPU_FACTORY_GENERIC>();
}

BEAGLE_CPU_TEMPLATE
const long BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getFlags() {
    return getBeagleCPUFlags<BEAGLE_CPU_FACTORY_GENERIC>();
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getInstanceDetails(BeagleInstanceDetails* returnInfo) {
    if (returnInfo != NULL) {
        returnInfo->resourceNumber = 0;
        returnInfo->flags = getFlags();
        returnInfo->flags |= kFlags;

        returnInfo->implName = (char*) getName();
    }

    return BEAGLE_SUCCESS;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setTipStates(int tipIndex,
                                                    const int* inStates) {
    if (tipIndex < 0 || tipIndex >= kTipCount)
        return BEAGLE_ERROR_OUT_OF_RANGE;
    gTipStates[tipIndex] = (int*) mallocAligned(sizeof(int) * kPaddedPatternCount);
    // TODO: What if this throws a memory full error?
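    // Out-of-range state codes are stored as kStateCount, which indexes the padded
    // (T_PAD) column of each transition matrix and so behaves as a completely
    // ambiguous "missing data" state; the padded tail patterns below get the same code.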
    for (int j = 0; j < kPatternCount; j++) {
        gTipStates[tipIndex][j] = (inStates[j] < kStateCount ? inStates[j] : kStateCount);
    }
    for (int j = kPatternCount; j < kPaddedPatternCount; j++) {
        gTipStates[tipIndex][j] = kStateCount;
    }

    return BEAGLE_SUCCESS;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setTipPartials(int tipIndex,
                                                      const double* inPartials) {
    if (tipIndex < 0 || tipIndex >= kTipCount)
        return BEAGLE_ERROR_OUT_OF_RANGE;
    if (gPartials[tipIndex] == NULL) {
        gPartials[tipIndex] = (REALTYPE*) mallocAligned(sizeof(REALTYPE) * kPartialsSize);
        // TODO: What if this throws a memory full error?
        if (gPartials[tipIndex] == 0L)
            return BEAGLE_ERROR_OUT_OF_MEMORY;
    }

    const double* inPartialsOffset;
    REALTYPE* tmpRealPartialsOffset = gPartials[tipIndex];
    for (int l = 0; l < kCategoryCount; l++) {
        inPartialsOffset = inPartials;
        for (int i = 0; i < kPatternCount; i++) {
            beagleMemCpy(tmpRealPartialsOffset, inPartialsOffset, kStateCount);
            tmpRealPartialsOffset += kPartialsPaddedStateCount;
            inPartialsOffset += kStateCount;
        }
        // Pad extra buffer with zeros
        for (int k = 0; k < kPartialsPaddedStateCount * (kPaddedPatternCount - kPatternCount); k++) {
            *tmpRealPartialsOffset++ = 0;
        }
    }

    return BEAGLE_SUCCESS;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setPartials(int bufferIndex,
                                                   const double* inPartials) {
    if (bufferIndex < 0 || bufferIndex >= kBufferCount)
        return BEAGLE_ERROR_OUT_OF_RANGE;
    if (gPartials[bufferIndex] == NULL) {
        gPartials[bufferIndex] = (REALTYPE*) malloc(sizeof(REALTYPE) * kPartialsSize);
        if (gPartials[bufferIndex] == 0L)
            return BEAGLE_ERROR_OUT_OF_MEMORY;
    }

    const double* inPartialsOffset = inPartials;
    REALTYPE* tmpRealPartialsOffset = gPartials[bufferIndex];
    for (int l = 0; l < kCategoryCount; l++) {
        for (int i = 0; i < kPatternCount; i++) {
            beagleMemCpy(tmpRealPartialsOffset, inPartialsOffset, kStateCount);
            tmpRealPartialsOffset += kPartialsPaddedStateCount;
            inPartialsOffset += kStateCount;
        }
        // Pad extra buffer with zeros
        for (int k = 0; k < kPartialsPaddedStateCount * (kPaddedPatternCount - kPatternCount); k++) {
            *tmpRealPartialsOffset++ = 0;
        }
    }

    return BEAGLE_SUCCESS;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getPartials(int bufferIndex,
                                                   int cumulativeScaleIndex,
                                                   double* outPartials) {
    // TODO: Make this work with partials padding

    // TODO: Test with and without padding
    if (bufferIndex < 0 || bufferIndex >= kBufferCount)
        return BEAGLE_ERROR_OUT_OF_RANGE;

    if (kPatternCount == kPaddedPatternCount) {
        beagleMemCpy(outPartials, gPartials[bufferIndex], kPartialsSize);
    } else { // Need to remove padding
        double* offsetOutPartials = outPartials;
        REALTYPE* offsetBeaglePartials = gPartials[bufferIndex];
        for (int i = 0; i < kCategoryCount; i++) {
            beagleMemCpy(offsetOutPartials, offsetBeaglePartials,
                         kPatternCount * kStateCount);
            offsetOutPartials += kPatternCount * kStateCount;
            offsetBeaglePartials += kPaddedPatternCount * kStateCount;
        }
    }

    if (cumulativeScaleIndex != BEAGLE_OP_NONE) {
        REALTYPE* cumulativeScaleBuffer = gScaleBuffers[cumulativeScaleIndex];
        int index = 0;
        for (int k = 0; k < kPatternCount; k++) {
            REALTYPE scaleFactor = exp(cumulativeScaleBuffer[k]);
            for (int i = 0; i < kStateCount; i++) {
                outPartials[index] *= scaleFactor;
                index++;
            }
        }
        // TODO: Do we assume the cumulativeScaleBuffer is on the log-scale?
    }

    return BEAGLE_SUCCESS;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setEigenDecomposition(int eigenIndex,
                                                             const double* inEigenVectors,
                                                             const double* inInverseEigenVectors,
                                                             const double* inEigenValues) {

    gEigenDecomposition->setEigenDecomposition(eigenIndex, inEigenVectors, inInverseEigenVectors, inEigenValues);
    return BEAGLE_SUCCESS;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setCategoryRates(const double* inCategoryRates) {
    memcpy(gCategoryRates, inCategoryRates, sizeof(double) * kCategoryCount);
    return BEAGLE_SUCCESS;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setPatternWeights(const double* inPatternWeights) {
    assert(inPatternWeights != 0L);
    memcpy(gPatternWeights, inPatternWeights, sizeof(double) * kPatternCount);
    return BEAGLE_SUCCESS;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setStateFrequencies(int stateFrequenciesIndex,
                                                           const double* inStateFrequencies) {
    if (stateFrequenciesIndex < 0 || stateFrequenciesIndex >= kEigenDecompCount)
        return BEAGLE_ERROR_OUT_OF_RANGE;
    if (gStateFrequencies[stateFrequenciesIndex] == NULL) {
        gStateFrequencies[stateFrequenciesIndex] = (REALTYPE*) malloc(sizeof(REALTYPE) * kStateCount);
        if (gStateFrequencies[stateFrequenciesIndex] == 0L)
            return BEAGLE_ERROR_OUT_OF_MEMORY;
    }
    beagleMemCpy(gStateFrequencies[stateFrequenciesIndex], inStateFrequencies, kStateCount);

    return BEAGLE_SUCCESS;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setCategoryWeights(int categoryWeightsIndex,
                                                          const double* inCategoryWeights) {
    if (categoryWeightsIndex < 0 || categoryWeightsIndex >= kEigenDecompCount)
        return BEAGLE_ERROR_OUT_OF_RANGE;
    if (gCategoryWeights[categoryWeightsIndex] == NULL) {
        gCategoryWeights[categoryWeightsIndex] = (REALTYPE*) malloc(sizeof(REALTYPE) * kCategoryCount);
        if (gCategoryWeights[categoryWeightsIndex] == 0L)
            return BEAGLE_ERROR_OUT_OF_MEMORY;
    }
    beagleMemCpy(gCategoryWeights[categoryWeightsIndex], inCategoryWeights, kCategoryCount);

    return BEAGLE_SUCCESS;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getTransitionMatrix(int matrixIndex,
                                                           double* outMatrix) {
    // TODO: Test with multiple rate categories
    if (T_PAD != 0) {
        double* offsetOutMatrix = outMatrix;
        REALTYPE* offsetBeagleMatrix = gTransitionMatrices[matrixIndex];
        for (int i = 0; i < kCategoryCount; i++) {
            for (int j = 0; j < kStateCount; j++) {
                beagleMemCpy(offsetOutMatrix, offsetBeagleMatrix, kStateCount);
                offsetBeagleMatrix += kTransPaddedStateCount; // Skip padding
                offsetOutMatrix += kStateCount;
            }
        }
    } else {
        beagleMemCpy(outMatrix, gTransitionMatrices[matrixIndex],
                     kMatrixSize * kCategoryCount);
    }
    return BEAGLE_SUCCESS;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getSiteLogLikelihoods(double* outLogLikelihoods) {
    beagleMemCpy(outLogLikelihoods, outLogLikelihoodsTmp, kPatternCount);

    return BEAGLE_SUCCESS;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getSiteDerivatives(double* outFirstDerivatives,
                                                          double* outSecondDerivatives) {
    beagleMemCpy(outFirstDerivatives, outFirstDerivativesTmp, kPatternCount);
    if (outSecondDerivatives != NULL)
        beagleMemCpy(outSecondDerivatives, outSecondDerivativesTmp, kPatternCount);

    return BEAGLE_SUCCESS;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setTransitionMatrix(int matrixIndex,
                                                           const double* inMatrix,
                                                           double paddedValue) {

    if (T_PAD != 0) {
        const double* offsetInMatrix = inMatrix;
        REALTYPE* offsetBeagleMatrix = gTransitionMatrices[matrixIndex];
        for (int i = 0; i < kCategoryCount; i++) {
            for (int j = 0; j < kStateCount; j++) {
                beagleMemCpy(offsetBeagleMatrix, offsetInMatrix, kStateCount);
                offsetBeagleMatrix[kStateCount] = paddedValue;
                offsetBeagleMatrix += kTransPaddedStateCount; // Skip padding
                offsetInMatrix += kStateCount;
            }
        }
    } else {
        beagleMemCpy(gTransitionMatrices[matrixIndex], inMatrix,
                     kMatrixSize * kCategoryCount);
    }
    return BEAGLE_SUCCESS;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::setTransitionMatrices(const int* matrixIndices,
                                                             const double* inMatrices,
                                                             const double* paddedValues,
                                                             int count) {
    for (int k = 0; k < count; k++) {
        const double* inMatrix = inMatrices + k * kStateCount * kStateCount * kCategoryCount;
        int matrixIndex = matrixIndices[k];

        if (T_PAD != 0) {
            const double* offsetInMatrix = inMatrix;
            REALTYPE* offsetBeagleMatrix = gTransitionMatrices[matrixIndex];
            for (int i = 0; i < kCategoryCount; i++) {
                for (int j = 0; j < kStateCount; j++) {
                    beagleMemCpy(offsetBeagleMatrix, offsetInMatrix, kStateCount);
                    offsetBeagleMatrix[kStateCount] = paddedValues[k];
                    offsetBeagleMatrix += kTransPaddedStateCount; // Skip padding
                    offsetInMatrix += kStateCount;
                }
            }
        } else {
            beagleMemCpy(gTransitionMatrices[matrixIndex], inMatrix,
                         kMatrixSize * kCategoryCount);
        }
    }

    return BEAGLE_SUCCESS;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::updateTransitionMatrices(int eigenIndex,
                                                                const int* probabilityIndices,
                                                                const int* firstDerivativeIndices,
                                                                const int* secondDerivativeIndices,
                                                                const double* edgeLengths,
                                                                int count) {
    gEigenDecomposition->updateTransitionMatrices(eigenIndex, probabilityIndices, firstDerivativeIndices, secondDerivativeIndices,
                                                  edgeLengths, gCategoryRates, gTransitionMatrices, count);
    return BEAGLE_SUCCESS;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::updatePartials(const int* operations,
                                                      int count,
                                                      int cumulativeScaleIndex) {

    REALTYPE* cumulativeScaleBuffer = NULL;
    if (cumulativeScaleIndex != BEAGLE_OP_NONE)
        cumulativeScaleBuffer = gScaleBuffers[cumulativeScaleIndex];

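    // Each operation occupies 7 consecutive ints: destination partials index,
    // write scaling index, read scaling index, then (partials index, transition
    // matrix index) for each of the two children.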
    for (int op = 0; op < count; op++) {
        if (DEBUGGING_OUTPUT) {
            std::cerr << "op[0]= " << operations[op * 7 + 0] << "\n";
            std::cerr << "op[1]= " << operations[op * 7 + 1] << "\n";
            std::cerr << "op[2]= " << operations[op * 7 + 2] << "\n";
            std::cerr << "op[3]= " << operations[op * 7 + 3] << "\n";
            std::cerr << "op[4]= " << operations[op * 7 + 4] << "\n";
            std::cerr << "op[5]= " << operations[op * 7 + 5] << "\n";
            std::cerr << "op[6]= " << operations[op * 7 + 6] << "\n";
        }

        const int parIndex = operations[op * 7];
        const int writeScalingIndex = operations[op * 7 + 1];
        const int readScalingIndex = operations[op * 7 + 2];
        const int child1Index = operations[op * 7 + 3];
        const int child1TransMatIndex = operations[op * 7 + 4];
        const int child2Index = operations[op * 7 + 5];
        const int child2TransMatIndex = operations[op * 7 + 6];

        const REALTYPE* partials1 = gPartials[child1Index];
        const REALTYPE* partials2 = gPartials[child2Index];

        const int* tipStates1 = gTipStates[child1Index];
        const int* tipStates2 = gTipStates[child2Index];

        const REALTYPE* matrices1 = gTransitionMatrices[child1TransMatIndex];
        const REALTYPE* matrices2 = gTransitionMatrices[child2TransMatIndex];

        REALTYPE* destPartials = gPartials[parIndex];

        int rescale = BEAGLE_OP_NONE;
        REALTYPE* scalingFactors = NULL;
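
        // rescale codes used below: 2 = auto-scaling, 1 = recompute scale factors
        // into scalingFactors, 0 = reuse previously computed ("fixed") factors,
        // BEAGLE_OP_NONE = no scaling for this operation.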
        if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
            gActiveScalingFactors[parIndex - kTipCount] = 0;
            if (tipStates1 == 0 && tipStates2 == 0)
                rescale = 2;
        } else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
            rescale = 1;
            scalingFactors = gScaleBuffers[parIndex - kTipCount];
        } else if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) { // TODO: this is a quick and dirty implementation just so it returns correct results
            if (tipStates1 == 0 && tipStates2 == 0) {
                rescale = 1;
                removeScaleFactors(&readScalingIndex, 1, cumulativeScaleIndex);
                scalingFactors = gScaleBuffers[writeScalingIndex];
            }
        } else if (writeScalingIndex >= 0) {
            rescale = 1;
            scalingFactors = gScaleBuffers[writeScalingIndex];
        } else if (readScalingIndex >= 0) {
            rescale = 0;
            scalingFactors = gScaleBuffers[readScalingIndex];
        }

        if (DEBUGGING_OUTPUT) {
            std::cerr << "Rescale= " << rescale << " writeIndex= " << writeScalingIndex
                      << " readIndex = " << readScalingIndex << "\n";
        }

        if (tipStates1 != NULL) {
            if (tipStates2 != NULL) {
                if (rescale == 0) { // Use fixed scaleFactors
                    calcStatesStatesFixedScaling(destPartials, tipStates1, matrices1, tipStates2, matrices2,
                                                 scalingFactors);
                } else {
                    // First compute without any scaling
                    calcStatesStates(destPartials, tipStates1, matrices1, tipStates2, matrices2);
                    if (rescale == 1) // Recompute scaleFactors
                        rescalePartials(destPartials, scalingFactors, cumulativeScaleBuffer, 0);
                }
            } else {
                if (rescale == 0) {
                    calcStatesPartialsFixedScaling(destPartials, tipStates1, matrices1, partials2, matrices2,
                                                   scalingFactors);
                } else {
                    calcStatesPartials(destPartials, tipStates1, matrices1, partials2, matrices2);
                    if (rescale == 1)
                        rescalePartials(destPartials, scalingFactors, cumulativeScaleBuffer, 0);
                }
            }
        } else {
            if (tipStates2 != NULL) {
                if (rescale == 0) {
                    calcStatesPartialsFixedScaling(destPartials, tipStates2, matrices2, partials1, matrices1,
                                                   scalingFactors);
                } else {
                    calcStatesPartials(destPartials, tipStates2, matrices2, partials1, matrices1);
                    if (rescale == 1)
                        rescalePartials(destPartials, scalingFactors, cumulativeScaleBuffer, 0);
                }
            } else {
                if (rescale == 2) {
                    int sIndex = parIndex - kTipCount;
                    calcPartialsPartialsAutoScaling(destPartials, partials1, matrices1, partials2, matrices2,
                                                    &gActiveScalingFactors[sIndex]);
                    if (gActiveScalingFactors[sIndex])
                        autoRescalePartials(destPartials, gAutoScaleBuffers[sIndex]);

                } else if (rescale == 0) {
                    calcPartialsPartialsFixedScaling(destPartials, partials1, matrices1, partials2, matrices2,
                                                     scalingFactors);
                } else {
                    calcPartialsPartials(destPartials, partials1, matrices1, partials2, matrices2);
                    if (rescale == 1)
                        rescalePartials(destPartials, scalingFactors, cumulativeScaleBuffer, 0);
                }
            }
        }

        if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
            int parScalingIndex = parIndex - kTipCount;
            int child1ScalingIndex = child1Index - kTipCount;
            int child2ScalingIndex = child2Index - kTipCount;
            if (child1ScalingIndex >= 0 && child2ScalingIndex >= 0) {
                int scalingIndices[2] = {child1ScalingIndex, child2ScalingIndex};
                accumulateScaleFactors(scalingIndices, 2, parScalingIndex);
            } else if (child1ScalingIndex >= 0) {
                int scalingIndices[1] = {child1ScalingIndex};
                accumulateScaleFactors(scalingIndices, 1, parScalingIndex);
            } else if (child2ScalingIndex >= 0) {
                int scalingIndices[1] = {child2ScalingIndex};
                accumulateScaleFactors(scalingIndices, 1, parScalingIndex);
            }
        }

        if (DEBUGGING_OUTPUT) {
            if (scalingFactors != NULL && rescale == 0) {
                for (int i = 0; i < kPatternCount; i++)
                    fprintf(stderr, "old scaleFactor[%d] = %.5f\n", i, scalingFactors[i]);
            }
            fprintf(stderr, "Result partials:\n");
            for (int i = 0; i < kPartialsSize; i++)
                fprintf(stderr, "destP[%d] = %.5f\n", i, destPartials[i]);
        }
    }

    return BEAGLE_SUCCESS;
}


BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::waitForPartials(const int* destinationPartials,
                                                       int destinationPartialsCount) {
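    // All computation in this implementation is synchronous (see
    // BEAGLE_FLAG_COMPUTATION_SYNCH above), so there is never anything to wait for.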
    return BEAGLE_SUCCESS;
}


BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calculateRootLogLikelihoods(const int* bufferIndices,
                                                                   const int* categoryWeightsIndices,
                                                                   const int* stateFrequenciesIndices,
                                                                   const int* cumulativeScaleIndices,
                                                                   int count,
                                                                   double* outSumLogLikelihood) {

    if (count == 1) {
        // We treat this as a special case so that we don't have convoluted logic
        // at the end of the loop over patterns
        int cumulativeScalingFactorIndex;
        if (kFlags & BEAGLE_FLAG_SCALING_AUTO)
            cumulativeScalingFactorIndex = 0;
        else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)
            cumulativeScalingFactorIndex = bufferIndices[0] - kTipCount;
        else
            cumulativeScalingFactorIndex = cumulativeScaleIndices[0];
        return calcRootLogLikelihoods(bufferIndices[0], categoryWeightsIndices[0], stateFrequenciesIndices[0],
                                      cumulativeScalingFactorIndex, outSumLogLikelihood);
    } else {
        return calcRootLogLikelihoodsMulti(bufferIndices, categoryWeightsIndices, stateFrequenciesIndices,
                                           cumulativeScaleIndices, count, outSumLogLikelihood);
    }
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcRootLogLikelihoodsMulti(const int* bufferIndices,
                                                                   const int* categoryWeightsIndices,
                                                                   const int* stateFrequenciesIndices,
                                                                   const int* scaleBufferIndices,
                                                                   int count,
                                                                   double* outSumLogLikelihood) {
    // Here we do 3 similar operations:
    // 1. set the lnL to the contribution of the first subset,
    // 2. add the lnL for the other subsets up to the penultimate one, and
    // 3. add the final subset and take the lnL.
    // This form of the calc would not work when count == 1 because
    // we need operations 1 and 3 in the preceding list. This is not
    // a problem, though, as we deal with count == 1 in the previous
    // branch.

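    // When cumulative scale buffers are in play, each subset's per-pattern
    // likelihood carries its own scaling exponent; before summing across
    // subsets, every contribution is rebased to the per-pattern maximum scale
    // factor (tracked in indexMaxScale/maxScaleFactor below), and the maximum
    // is added back after the log is taken.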
    std::vector<int> indexMaxScale(kPatternCount);
    std::vector<REALTYPE> maxScaleFactor(kPatternCount);

    int returnCode = BEAGLE_SUCCESS;

    for (int subsetIndex = 0; subsetIndex < count; ++subsetIndex) {
        const int rootPartialIndex = bufferIndices[subsetIndex];
        const REALTYPE* rootPartials = gPartials[rootPartialIndex];
        const REALTYPE* frequencies = gStateFrequencies[stateFrequenciesIndices[subsetIndex]];
        const REALTYPE* wt = gCategoryWeights[categoryWeightsIndices[subsetIndex]];
        int u = 0;
        int v = 0;
        for (int k = 0; k < kPatternCount; k++) {
            for (int i = 0; i < kStateCount; i++) {
                integrationTmp[u] = rootPartials[v] * (REALTYPE) wt[0];
                u++;
                v++;
            }
            v += P_PAD;
        }
        for (int l = 1; l < kCategoryCount; l++) {
            u = 0;
            for (int k = 0; k < kPatternCount; k++) {
                for (int i = 0; i < kStateCount; i++) {
                    integrationTmp[u] += rootPartials[v] * (REALTYPE) wt[l];
                    u++;
                    v++;
                }
                v += P_PAD;
            }
        }
        u = 0;
        for (int k = 0; k < kPatternCount; k++) {
            REALTYPE sum = 0.0;
            for (int i = 0; i < kStateCount; i++) {
                sum += ((REALTYPE) frequencies[i]) * integrationTmp[u];
                u++;
            }

            // TODO: allow only some subsets to have scale indices
            if (scaleBufferIndices[0] != BEAGLE_OP_NONE || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) {
                int cumulativeScalingFactorIndex;
                if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)
                    cumulativeScalingFactorIndex = rootPartialIndex - kTipCount;
                else
                    cumulativeScalingFactorIndex = scaleBufferIndices[subsetIndex];

                const REALTYPE* cumulativeScaleFactors = gScaleBuffers[cumulativeScalingFactorIndex];

                if (subsetIndex == 0) {
                    indexMaxScale[k] = 0;
                    maxScaleFactor[k] = cumulativeScaleFactors[k];
                    for (int j = 1; j < count; j++) {
                        REALTYPE tmpScaleFactor;
                        if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)
                            tmpScaleFactor = gScaleBuffers[bufferIndices[j] - kTipCount][k];
                        else
                            tmpScaleFactor = gScaleBuffers[scaleBufferIndices[j]][k];

                        if (tmpScaleFactor > maxScaleFactor[k]) {
                            indexMaxScale[k] = j;
                            maxScaleFactor[k] = tmpScaleFactor;
                        }
                    }
                }

                if (subsetIndex != indexMaxScale[k])
                    sum *= exp((REALTYPE)(cumulativeScaleFactors[k] - maxScaleFactor[k]));
            }

            if (subsetIndex == 0) {
                outLogLikelihoodsTmp[k] = sum;
            } else if (subsetIndex == count - 1) {
                REALTYPE tmpSum = outLogLikelihoodsTmp[k] + sum;

                outLogLikelihoodsTmp[k] = log(tmpSum);
            } else {
                outLogLikelihoodsTmp[k] += sum;
            }
        }
    }

    if (scaleBufferIndices[0] != BEAGLE_OP_NONE || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) {
        for (int i = 0; i < kPatternCount; i++)
            outLogLikelihoodsTmp[i] += maxScaleFactor[i];
    }

    *outSumLogLikelihood = 0.0;
    for (int i = 0; i < kPatternCount; i++) {
        *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
    }

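    // (x - x == 0.0) is false only when x is NaN or +/-infinity, so this is a
    // portable check that the summed log-likelihood is finite.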
    if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
        returnCode = BEAGLE_ERROR_FLOATING_POINT;

    return returnCode;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcRootLogLikelihoods(const int bufferIndex,
                                                              const int categoryWeightsIndex,
                                                              const int stateFrequenciesIndex,
                                                              const int scalingFactorsIndex,
                                                              double* outSumLogLikelihood) {

    int returnCode = BEAGLE_SUCCESS;

    const REALTYPE* rootPartials = gPartials[bufferIndex];
    const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];
    const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndex];
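
    // The first two loops below accumulate, for each pattern k and state i,
    // integrationTmp[k * kStateCount + i] = sum_l wt[l] * rootPartials[l, k, i];
    // the per-pattern log-likelihood is then log(sum_i freqs[i] * integrationTmp[...]).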
    int u = 0;
    int v = 0;
    for (int k = 0; k < kPatternCount; k++) {
        for (int i = 0; i < kStateCount; i++) {
            integrationTmp[u] = rootPartials[v] * (REALTYPE) wt[0];
            u++;
            v++;
        }
        v += P_PAD;
    }
    for (int l = 1; l < kCategoryCount; l++) {
        u = 0;
        for (int k = 0; k < kPatternCount; k++) {
            for (int i = 0; i < kStateCount; i++) {
                integrationTmp[u] += rootPartials[v] * (REALTYPE) wt[l];
                u++;
                v++;
            }
            v += P_PAD;
        }
    }
    u = 0;
    for (int k = 0; k < kPatternCount; k++) {
        REALTYPE sum = 0.0;
        for (int i = 0; i < kStateCount; i++) {
            sum += freqs[i] * integrationTmp[u];
            u++;
        }

        outLogLikelihoodsTmp[k] = log(sum);
    }

    if (scalingFactorsIndex >= 0) {
        const REALTYPE* cumulativeScaleFactors = gScaleBuffers[scalingFactorsIndex];
        for (int i = 0; i < kPatternCount; i++) {
            outLogLikelihoodsTmp[i] += cumulativeScaleFactors[i];
        }
    }

    *outSumLogLikelihood = 0.0;
    for (int i = 0; i < kPatternCount; i++) {
        *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
    }

    if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
        returnCode = BEAGLE_ERROR_FLOATING_POINT;

    // TODO: merge the three kPatternCount loops above into one

    return returnCode;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::accumulateScaleFactors(const int* scalingIndices,
                                                              int count,
                                                              int cumulativeScalingIndex) {
    if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
        REALTYPE* cumulativeScaleBuffer = gScaleBuffers[0];
        for (int j = 0; j < kPatternCount; j++)
            cumulativeScaleBuffer[j] = 0;
        for (int i = 0; i < count; i++) {
            int sIndex = scalingIndices[i] - kTipCount;
            if (gActiveScalingFactors[sIndex]) {
                const signed short* scaleBuffer = gAutoScaleBuffers[sIndex];
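                // Auto-scale buffers hold per-pattern exponents in base 2;
                // multiplying by M_LN2 converts them to the natural-log scale
                // used by the cumulative buffer.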
                for (int j = 0; j < kPatternCount; j++) {
                    cumulativeScaleBuffer[j] += M_LN2 * scaleBuffer[j];
                }
            }
        }

    } else {
        REALTYPE* cumulativeScaleBuffer = gScaleBuffers[cumulativeScalingIndex];
        for (int i = 0; i < count; i++) {
            const REALTYPE* scaleBuffer = gScaleBuffers[scalingIndices[i]];
            for (int j = 0; j < kPatternCount; j++) {
                if (kFlags & BEAGLE_FLAG_SCALERS_LOG)
                    cumulativeScaleBuffer[j] += scaleBuffer[j];
                else
                    cumulativeScaleBuffer[j] += log(scaleBuffer[j]);
            }
        }

        if (DEBUGGING_OUTPUT) {
            fprintf(stderr, "Accumulating %d scale buffers into #%d\n", count, cumulativeScalingIndex);
            for (int j = 0; j < kPatternCount; j++) {
                fprintf(stderr, "cumulativeScaleBuffer[%d] = %2.5e\n", j, cumulativeScaleBuffer[j]);
            }
        }
    }

    return BEAGLE_SUCCESS;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::removeScaleFactors(const int* scalingIndices,
                                                          int count,
                                                          int cumulativeScalingIndex) {
    REALTYPE* cumulativeScaleBuffer = gScaleBuffers[cumulativeScalingIndex];
    for (int i = 0; i < count; i++) {
        const REALTYPE* scaleBuffer = gScaleBuffers[scalingIndices[i]];
        for (int j = 0; j < kPatternCount; j++) {
            if (kFlags & BEAGLE_FLAG_SCALERS_LOG)
                cumulativeScaleBuffer[j] -= scaleBuffer[j];
            else
                cumulativeScaleBuffer[j] -= log(scaleBuffer[j]);
        }
    }

    return BEAGLE_SUCCESS;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::resetScaleFactors(int cumulativeScalingIndex) {
    //memcpy(gScaleBuffers[cumulativeScalingIndex],zeros,sizeof(double) * kPatternCount);

    if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
        memset(gScaleBuffers[cumulativeScalingIndex], 0, sizeof(signed short) * kPaddedPatternCount);
    } else {
        memset(gScaleBuffers[cumulativeScalingIndex], 0, sizeof(REALTYPE) * kPaddedPatternCount);
    }
    return BEAGLE_SUCCESS;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::copyScaleFactors(int destScalingIndex,
                                                        int srcScalingIndex) {
    memcpy(gScaleBuffers[destScalingIndex], gScaleBuffers[srcScalingIndex], sizeof(REALTYPE) * kPatternCount);

    return BEAGLE_SUCCESS;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calculateEdgeLogLikelihoods(const int* parentBufferIndices,
                                                                   const int* childBufferIndices,
                                                                   const int* probabilityIndices,
                                                                   const int* firstDerivativeIndices,
                                                                   const int* secondDerivativeIndices,
                                                                   const int* categoryWeightsIndices,
                                                                   const int* stateFrequenciesIndices,
                                                                   const int* cumulativeScaleIndices,
                                                                   int count,
                                                                   double* outSumLogLikelihood,
                                                                   double* outSumFirstDerivative,
                                                                   double* outSumSecondDerivative) {
    // TODO: implement for count > 1

    if (count == 1) {
        int cumulativeScalingFactorIndex;
        if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
            cumulativeScalingFactorIndex = 0;
        } else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
            cumulativeScalingFactorIndex = kInternalPartialsBufferCount;
            int child1ScalingIndex = parentBufferIndices[0] - kTipCount;
            int child2ScalingIndex = childBufferIndices[0] - kTipCount;
            resetScaleFactors(cumulativeScalingFactorIndex);
            if (child1ScalingIndex >= 0 && child2ScalingIndex >= 0) {
                int scalingIndices[2] = {child1ScalingIndex, child2ScalingIndex};
                accumulateScaleFactors(scalingIndices, 2, cumulativeScalingFactorIndex);
            } else if (child1ScalingIndex >= 0) {
                int scalingIndices[1] = {child1ScalingIndex};
                accumulateScaleFactors(scalingIndices, 1, cumulativeScalingFactorIndex);
            } else if (child2ScalingIndex >= 0) {
                int scalingIndices[1] = {child2ScalingIndex};
                accumulateScaleFactors(scalingIndices, 1, cumulativeScalingFactorIndex);
            }
        } else {
            cumulativeScalingFactorIndex = cumulativeScaleIndices[0];
        }
        if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL)
            return calcEdgeLogLikelihoods(parentBufferIndices[0], childBufferIndices[0], probabilityIndices[0],
                                          categoryWeightsIndices[0], stateFrequenciesIndices[0], cumulativeScalingFactorIndex,
                                          outSumLogLikelihood);
        else if (secondDerivativeIndices == NULL)
            return calcEdgeLogLikelihoodsFirstDeriv(parentBufferIndices[0], childBufferIndices[0], probabilityIndices[0],
                                                    firstDerivativeIndices[0], categoryWeightsIndices[0], stateFrequenciesIndices[0],
                                                    cumulativeScalingFactorIndex, outSumLogLikelihood, outSumFirstDerivative);
        else
            return calcEdgeLogLikelihoodsSecondDeriv(parentBufferIndices[0], childBufferIndices[0], probabilityIndices[0],
                                                     firstDerivativeIndices[0], secondDerivativeIndices[0], categoryWeightsIndices[0],
                                                     stateFrequenciesIndices[0], cumulativeScalingFactorIndex, outSumLogLikelihood,
                                                     outSumFirstDerivative, outSumSecondDerivative);
    } else {
        if ((kFlags & BEAGLE_FLAG_SCALING_AUTO) || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) {
            fprintf(stderr, "BeagleCPUImpl::calculateEdgeLogLikelihoods not yet implemented for count > 1 and auto/always scaling\n");
            return BEAGLE_ERROR_GENERAL;
        }

        if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL) {
            return calcEdgeLogLikelihoodsMulti(parentBufferIndices, childBufferIndices, probabilityIndices,
                                               categoryWeightsIndices, stateFrequenciesIndices, cumulativeScaleIndices, count,
                                               outSumLogLikelihood);
        } else {
            fprintf(stderr, "BeagleCPUImpl::calculateEdgeLogLikelihoods not yet implemented for count > 1 and derivatives\n");
            return BEAGLE_ERROR_GENERAL;
        }
    }
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoods(const int parIndex,
                                                              const int childIndex,
                                                              const int probIndex,
                                                              const int categoryWeightsIndex,
                                                              const int stateFrequenciesIndex,
                                                              const int scalingFactorsIndex,
                                                              double* outSumLogLikelihood) {

    assert(parIndex >= kTipCount);

    int returnCode = BEAGLE_SUCCESS;

    const REALTYPE* partialsParent = gPartials[parIndex];
    const REALTYPE* transMatrix = gTransitionMatrices[probIndex];
    const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];
    const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndex];

    memset(integrationTmp, 0, (kPatternCount * kStateCount) * sizeof(REALTYPE));

    if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child

        const int* statesChild = gTipStates[childIndex];
        int v = 0; // Index for parent partials

        for (int l = 0; l < kCategoryCount; l++) {
            int u = 0; // Index in resulting product-partials (summed over categories)
            const REALTYPE weight = wt[l];
            for (int k = 0; k < kPatternCount; k++) {

                const int stateChild = statesChild[k]; // DISCUSSION PT: Does it make sense to change the order of the partials,
                                                       // so we can interchange the patternCount and categoryCount loop order?
                int w = l * kMatrixSize;
                for (int i = 0; i < kStateCount; i++) {
                    integrationTmp[u] += transMatrix[w + stateChild] * partialsParent[v + i] * weight;
                    u++;

                    w += kTransPaddedStateCount;
                }
                v += kPartialsPaddedStateCount;
            }
        }

    } else { // Integrate against a partial at the child

        const REALTYPE* partialsChild = gPartials[childIndex];
        int v = 0;
        int stateCountModFour = (kStateCount / 4) * 4;

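        // The inner dot product below is manually unrolled four-wide into two
        // accumulators (sumOverJA/sumOverJB) to shorten the floating-point
        // dependency chain; the second loop picks up the kStateCount % 4 remainder.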
        for (int l = 0; l < kCategoryCount; l++) {
            int u = 0;
            const REALTYPE weight = wt[l];
            for (int k = 0; k < kPatternCount; k++) {
                int w = l * kMatrixSize;
                const REALTYPE* partialsChildPtr = &partialsChild[v];
                for (int i = 0; i < kStateCount; i++) {
                    double sumOverJA = 0.0, sumOverJB = 0.0;
                    int j = 0;
                    const REALTYPE* transMatrixPtr = &transMatrix[w];
                    for (; j < stateCountModFour; j += 4) {
                        sumOverJA += transMatrixPtr[j + 0] * partialsChildPtr[j + 0];
                        sumOverJB += transMatrixPtr[j + 1] * partialsChildPtr[j + 1];
                        sumOverJA += transMatrixPtr[j + 2] * partialsChildPtr[j + 2];
                        sumOverJB += transMatrixPtr[j + 3] * partialsChildPtr[j + 3];
                    }
                    for (; j < kStateCount; j++) {
                        sumOverJA += transMatrixPtr[j] * partialsChildPtr[j];
                    }
                    // for(int j = 0; j < kStateCount; j++) {
                    //     sumOverJ += transMatrix[w] * partialsChild[v + j];
                    //     w++;
                    // }
                    integrationTmp[u] += (sumOverJA + sumOverJB) * partialsParent[v + i] * weight;
                    u++;

                    w += kStateCount;

                    // increment for the extra column at the end
                    w += T_PAD;
                }
                v += kPartialsPaddedStateCount;
            }
        }
    }

    int u = 0;
    for (int k = 0; k < kPatternCount; k++) {
        REALTYPE sumOverI = 0.0;
        for (int i = 0; i < kStateCount; i++) {
            sumOverI += freqs[i] * integrationTmp[u];
            u++;
        }

        outLogLikelihoodsTmp[k] = log(sumOverI);
    }

    if (scalingFactorsIndex != BEAGLE_OP_NONE) {
        const REALTYPE* scalingFactors = gScaleBuffers[scalingFactorsIndex];
        for (int k = 0; k < kPatternCount; k++)
            outLogLikelihoodsTmp[k] += scalingFactors[k];
    }

    *outSumLogLikelihood = 0.0;
    for (int i = 0; i < kPatternCount; i++) {
        *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
    }

    if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
        returnCode = BEAGLE_ERROR_FLOATING_POINT;

    return returnCode;
}

BEAGLE_CPU_TEMPLATE
int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoodsMulti(const int* parentBufferIndices,
                                                                   const int* childBufferIndices,
                                                                   const int* probabilityIndices,
                                                                   const int* categoryWeightsIndices,
                                                                   const int* stateFrequenciesIndices,
                                                                   const int* scalingFactorsIndices,
                                                                   int count,
                                                                   double* outSumLogLikelihood) {

    std::vector<int> indexMaxScale(kPatternCount);
    std::vector<REALTYPE> maxScaleFactor(kPatternCount);

    int returnCode = BEAGLE_SUCCESS;

    for (int subsetIndex = 0; subsetIndex < count; ++subsetIndex) {
        const REALTYPE* partialsParent = gPartials[parentBufferIndices[subsetIndex]];
        const REALTYPE* transMatrix = gTransitionMatrices[probabilityIndices[subsetIndex]];
        const REALTYPE* wt = gCategoryWeights[categoryWeightsIndices[subsetIndex]];
        const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndices[subsetIndex]];
        int childIndex = childBufferIndices[subsetIndex];

        memset(integrationTmp, 0, (kPatternCount * kStateCount) * sizeof(REALTYPE));

        if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child

            const int* statesChild = gTipStates[childIndex];
            int v = 0; // Index for parent partials

            for (int l = 0; l < kCategoryCount; l++) {
                int u = 0; // Index in resulting product-partials (summed over categories)
                const REALTYPE weight = wt[l];
                for (int k = 0; k < kPatternCount; k++) {

                    const int stateChild = statesChild[k]; // DISCUSSION PT: Does it make sense to change the order of the partials,
                                                           // so we can interchange the patternCount and categoryCount loop order?
                    int w = l * kMatrixSize;
                    for (int i = 0; i < kStateCount; i++) {
                        integrationTmp[u] += transMatrix[w + stateChild] * partialsParent[v + i] * weight;
                        u++;

                        w += kTransPaddedStateCount;
                    }
                    v += kPartialsPaddedStateCount;
                }
            }
        } else {
            const REALTYPE* partialsChild = gPartials[childIndex];
            int v = 0;
            int stateCountModFour = (kStateCount / 4) * 4;

            for (int l = 0; l < kCategoryCount; l++) {
                int u = 0;
                const REALTYPE weight = wt[l];
                for (int k = 0; k < kPatternCount; k++) {
                    int w = l * kMatrixSize;
                    const REALTYPE* partialsChildPtr = &partialsChild[v];
                    for (int i = 0; i < kStateCount; i++) {
                        double sumOverJA = 0.0, sumOverJB = 0.0;
                        int j = 0;
                        const REALTYPE* transMatrixPtr = &transMatrix[w];
                        for (; j < stateCountModFour; j += 4) {
                            sumOverJA += transMatrixPtr[j + 0] * partialsChildPtr[j + 0];
                            sumOverJB += transMatrixPtr[j + 1] * partialsChildPtr[j + 1];
                            sumOverJA += transMatrixPtr[j + 2] * partialsChildPtr[j + 2];
                            sumOverJB += transMatrixPtr[j + 3] * partialsChildPtr[j + 3];
                        }
                        for (; j < kStateCount; j++) {
                            sumOverJA += transMatrixPtr[j] * partialsChildPtr[j];
                        }
                        // for(int j = 0; j < kStateCount; j++) {
                        //     sumOverJ += transMatrix[w] * partialsChild[v + j];
                        //     w++;
                        // }
                        integrationTmp[u] += (sumOverJA + sumOverJB) * partialsParent[v + i] * weight;
                        u++;

                        w += kStateCount;

                        // increment for the extra column at the end
                        w += T_PAD;
                    }
                    v += kPartialsPaddedStateCount;
                }
            }

        }
        int u = 0;
        for (int k = 0; k < kPatternCount; k++) {
            REALTYPE sumOverI = 0.0;
            for (int i = 0; i < kStateCount; i++) {
                sumOverI += freqs[i] * integrationTmp[u];
                u++;
            }

            if (scalingFactorsIndices[0] != BEAGLE_OP_NONE) {
                int cumulativeScalingFactorIndex = scalingFactorsIndices[subsetIndex];

                const REALTYPE* cumulativeScaleFactors = gScaleBuffers[cumulativeScalingFactorIndex];

                if (subsetIndex == 0) {
                    indexMaxScale[k] = 0;
                    maxScaleFactor[k] = cumulativeScaleFactors[k];
                    for (int j = 1; j < count; j++) {
                        REALTYPE tmpScaleFactor = gScaleBuffers[scalingFactorsIndices[j]][k];

                        if (tmpScaleFactor > maxScaleFactor[k]) {
                            indexMaxScale[k] = j;
                            maxScaleFactor[k] = tmpScaleFactor;
                        }
                    }
                }

                if (subsetIndex != indexMaxScale[k])
                    sumOverI *= exp((REALTYPE)(cumulativeScaleFactors[k] - maxScaleFactor[k]));
            }

            if (subsetIndex == 0) {
                outLogLikelihoodsTmp[k] = sumOverI;
            } else if (subsetIndex == count - 1) {
                REALTYPE tmpSum = outLogLikelihoodsTmp[k] + sumOverI;

                outLogLikelihoodsTmp[k] = log(tmpSum);
            } else {
                outLogLikelihoodsTmp[k] += sumOverI;
            }
        }
    }

    if (scalingFactorsIndices[0] != BEAGLE_OP_NONE) {
        for (int i = 0; i < kPatternCount; i++)
            outLogLikelihoodsTmp[i] += maxScaleFactor[i];
    }

    *outSumLogLikelihood = 0.0;
    for (int i = 0; i < kPatternCount; i++) {
        *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
    }

    if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
        returnCode = BEAGLE_ERROR_FLOATING_POINT;

    return returnCode;
}
1459 
1460 
1461 BEAGLE_CPU_TEMPLATE
1462 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoodsFirstDeriv(const int parIndex,
1463  const int childIndex,
1464  const int probIndex,
1465  const int firstDerivativeIndex,
1466  const int categoryWeightsIndex,
1467  const int stateFrequenciesIndex,
1468  const int scalingFactorsIndex,
1469  double* outSumLogLikelihood,
1470  double* outSumFirstDerivative) {
1471 
1472  assert(parIndex >= kTipCount);
1473 
1474  int returnCode = BEAGLE_SUCCESS;
1475 
1476  const REALTYPE* partialsParent = gPartials[parIndex];
1477  const REALTYPE* transMatrix = gTransitionMatrices[probIndex];
1478  const REALTYPE* firstDerivMatrix = gTransitionMatrices[firstDerivativeIndex];
1479  const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];
1480  const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndex];
1481 
1482 
1483  memset(integrationTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
1484  memset(firstDerivTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
1485 
1486  if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child
1487 
1488  const int* statesChild = gTipStates[childIndex];
1489  int v = 0; // Index for parent partials
1490 
1491  for(int l = 0; l < kCategoryCount; l++) {
1492  int u = 0; // Index in resulting product-partials (summed over categories)
1493  const REALTYPE weight = wt[l];
1494  for(int k = 0; k < kPatternCount; k++) {
1495 
1496  const int stateChild = statesChild[k]; // DISCUSSION PT: Does it make sense to change the order of the partials,
1497  // so we can interchange the patterCount and categoryCount loop order?
1498  int w = l * kMatrixSize;
1499  for(int i = 0; i < kStateCount; i++) {
1500  integrationTmp[u] += transMatrix[w + stateChild] * partialsParent[v + i] * weight;
1501  firstDerivTmp[u] += firstDerivMatrix[w + stateChild] * partialsParent[v + i] * weight;
1502  u++;
1503 
1504  w += kTransPaddedStateCount;
1505  }
1506  v += kPartialsPaddedStateCount;
1507  }
1508  }
1509 
1510  } else { // Integrate against a partial at the child
1511 
1512  const REALTYPE* partialsChild = gPartials[childIndex];
1513  int v = 0;
1514 
1515  for(int l = 0; l < kCategoryCount; l++) {
1516  int u = 0;
1517  const REALTYPE weight = wt[l];
1518  for(int k = 0; k < kPatternCount; k++) {
1519  int w = l * kMatrixSize;
1520  for(int i = 0; i < kStateCount; i++) {
1521  double sumOverJ = 0.0;
1522  double sumOverJD1 = 0.0;
1523  for(int j = 0; j < kStateCount; j++) {
1524  sumOverJ += transMatrix[w] * partialsChild[v + j];
1525  sumOverJD1 += firstDerivMatrix[w] * partialsChild[v + j];
1526  w++;
1527  }
1528 
1529  // increment for the extra column at the end
1530  w += T_PAD;
1531 
1532  integrationTmp[u] += sumOverJ * partialsParent[v + i] * weight;
1533  firstDerivTmp[u] += sumOverJD1 * partialsParent[v + i] * weight;
1534  u++;
1535  }
1536  v += kPartialsPaddedStateCount;
1537  }
1538  }
1539  }
1540 
1541  int u = 0;
1542  for(int k = 0; k < kPatternCount; k++) {
1543  REALTYPE sumOverI = 0.0;
1544  REALTYPE sumOverID1 = 0.0;
1545  for(int i = 0; i < kStateCount; i++) {
1546  sumOverI += freqs[i] * integrationTmp[u];
1547  sumOverID1 += freqs[i] * firstDerivTmp[u];
1548  u++;
1549  }
1550 
1551  outLogLikelihoodsTmp[k] = log(sumOverI);
1552  outFirstDerivativesTmp[k] = sumOverID1 / sumOverI;
1553  }
1554 
1555 
1556  if (scalingFactorsIndex != BEAGLE_OP_NONE) {
1557  const REALTYPE* scalingFactors = gScaleBuffers[scalingFactorsIndex];
1558  for(int k=0; k < kPatternCount; k++)
1559  outLogLikelihoodsTmp[k] += scalingFactors[k];
1560  }
1561 
1562  *outSumLogLikelihood = 0.0;
1563  *outSumFirstDerivative = 0.0;
1564  for (int i = 0; i < kPatternCount; i++) {
1565  *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
1566 
1567  *outSumFirstDerivative += outFirstDerivativesTmp[i] * gPatternWeights[i];
1568  }
1569 
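 // NaN/Inf guard: (x - x) is 0.0 for every finite x but NaN when x is
 // NaN or +/-infinity, so the test below trips only when the summed
 // log-likelihood has left the representable range.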
1570  if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
1571  returnCode = BEAGLE_ERROR_FLOATING_POINT;
1572 
1573  return returnCode;
1574 }
1575 
1576 BEAGLE_CPU_TEMPLATE
1577 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoodsSecondDeriv(const int parIndex,
1578  const int childIndex,
1579  const int probIndex,
1580  const int firstDerivativeIndex,
1581  const int secondDerivativeIndex,
1582  const int categoryWeightsIndex,
1583  const int stateFrequenciesIndex,
1584  const int scalingFactorsIndex,
1585  double* outSumLogLikelihood,
1586  double* outSumFirstDerivative,
1587  double* outSumSecondDerivative) {
1588 
1589  assert(parIndex >= kTipCount);
1590 
1591  int returnCode = BEAGLE_SUCCESS;
1592 
1593  const REALTYPE* partialsParent = gPartials[parIndex];
1594  const REALTYPE* transMatrix = gTransitionMatrices[probIndex];
1595  const REALTYPE* firstDerivMatrix = gTransitionMatrices[firstDerivativeIndex];
1596  const REALTYPE* secondDerivMatrix = gTransitionMatrices[secondDerivativeIndex];
1597  const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];
1598  const REALTYPE* freqs = gStateFrequencies[stateFrequenciesIndex];
1599 
1600 
1601  memset(integrationTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
1602  memset(firstDerivTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
1603  memset(secondDerivTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));
1604 
1605  if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child
1606 
1607  const int* statesChild = gTipStates[childIndex];
1608  int v = 0; // Index for parent partials
1609 
1610  for(int l = 0; l < kCategoryCount; l++) {
1611  int u = 0; // Index in resulting product-partials (summed over categories)
1612  const REALTYPE weight = wt[l];
1613  for(int k = 0; k < kPatternCount; k++) {
1614 
1615  const int stateChild = statesChild[k]; // DISCUSSION PT: Does it make sense to change the order of the partials,
1616  // so we can interchange the patternCount and categoryCount loop order?
1617  int w = l * kMatrixSize;
1618  for(int i = 0; i < kStateCount; i++) {
1619  integrationTmp[u] += transMatrix[w + stateChild] * partialsParent[v + i] * weight;
1620  firstDerivTmp[u] += firstDerivMatrix[w + stateChild] * partialsParent[v + i] * weight;
1621  secondDerivTmp[u] += secondDerivMatrix[w + stateChild] * partialsParent[v + i] * weight;
1622  u++;
1623 
1624  w += kTransPaddedStateCount;
1625  }
1626  v += kPartialsPaddedStateCount;
1627  }
1628  }
1629 
1630  } else { // Integrate against a partial at the child
1631 
1632  const REALTYPE* partialsChild = gPartials[childIndex];
1633  int v = 0;
1634 
1635  for(int l = 0; l < kCategoryCount; l++) {
1636  int u = 0;
1637  const REALTYPE weight = wt[l];
1638  for(int k = 0; k < kPatternCount; k++) {
1639  int w = l * kMatrixSize;
1640  for(int i = 0; i < kStateCount; i++) {
1641  double sumOverJ = 0.0;
1642  double sumOverJD1 = 0.0;
1643  double sumOverJD2 = 0.0;
1644  for(int j = 0; j < kStateCount; j++) {
1645  sumOverJ += transMatrix[w] * partialsChild[v + j];
1646  sumOverJD1 += firstDerivMatrix[w] * partialsChild[v + j];
1647  sumOverJD2 += secondDerivMatrix[w] * partialsChild[v + j];
1648  w++;
1649  }
1650 
1651  // increment for the extra column at the end
1652  w += T_PAD;
1653 
1654  integrationTmp[u] += sumOverJ * partialsParent[v + i] * weight;
1655  firstDerivTmp[u] += sumOverJD1 * partialsParent[v + i] * weight;
1656  secondDerivTmp[u] += sumOverJD2 * partialsParent[v + i] * weight;
1657  u++;
1658  }
1659  v += kPartialsPaddedStateCount;
1660  }
1661  }
1662  }
1663 
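 // Chain/quotient rules for the per-pattern log-likelihood derivatives:
 //   d(log L)/dt   = L'/L
 //   d2(log L)/dt2 = L''/L - (L'/L)^2
 // which is exactly what the loop below accumulates.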
1664  int u = 0;
1665  for(int k = 0; k < kPatternCount; k++) {
1666  REALTYPE sumOverI = 0.0;
1667  REALTYPE sumOverID1 = 0.0;
1668  REALTYPE sumOverID2 = 0.0;
1669  for(int i = 0; i < kStateCount; i++) {
1670  sumOverI += freqs[i] * integrationTmp[u];
1671  sumOverID1 += freqs[i] * firstDerivTmp[u];
1672  sumOverID2 += freqs[i] * secondDerivTmp[u];
1673  u++;
1674  }
1675 
1676  outLogLikelihoodsTmp[k] = log(sumOverI);
1677  outFirstDerivativesTmp[k] = sumOverID1 / sumOverI;
1678  outSecondDerivativesTmp[k] = sumOverID2 / sumOverI - outFirstDerivativesTmp[k] * outFirstDerivativesTmp[k];
1679  }
1680 
1681 
1682  if (scalingFactorsIndex != BEAGLE_OP_NONE) {
1683  const REALTYPE* scalingFactors = gScaleBuffers[scalingFactorsIndex];
1684  for(int k=0; k < kPatternCount; k++)
1685  outLogLikelihoodsTmp[k] += scalingFactors[k];
1686  }
1687 
1688  *outSumLogLikelihood = 0.0;
1689  *outSumFirstDerivative = 0.0;
1690  *outSumSecondDerivative = 0.0;
1691  for (int i = 0; i < kPatternCount; i++) {
1692  *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
1693 
1694  *outSumFirstDerivative += outFirstDerivativesTmp[i] * gPatternWeights[i];
1695 
1696  *outSumSecondDerivative += outSecondDerivativesTmp[i] * gPatternWeights[i];
1697  }
1698 
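 // As above, (x - x == 0.0) fails only for NaN or +/-infinity.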
1699  if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
1700  returnCode = BEAGLE_ERROR_FLOATING_POINT;
1701 
1702  return returnCode;
1703 }
1704 
1705 
1706 
1707 BEAGLE_CPU_TEMPLATE
1708 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::block(void) {
1709  // Do nothing: this implementation computes synchronously, so all results are already available.
1710  return BEAGLE_SUCCESS;
1711 }
1712 
1713 ///////////////////////////////////////////////////////////////////////////
1714 // private methods
1715 
1716 /*
1717  * Re-scales the partial likelihoods such that the largest is one.
1718  */
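// Worked example (illustrative values): if one pattern's partials across
// states are {0.02, 0.001, 0.004}, the maximum 0.02 becomes that pattern's
// scale factor, the stored partials become {1.0, 0.05, 0.2}, and log(0.02)
// is accumulated so the final log-likelihood can be corrected.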
1719 BEAGLE_CPU_TEMPLATE
1720 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::rescalePartials(REALTYPE* destP,
1721  REALTYPE* scaleFactors,
1722  REALTYPE* cumulativeScaleFactors,
1723  const int fillWithOnes) {
1724  if (DEBUGGING_OUTPUT) {
1725  std::cerr << "destP (before rescale): \n";// << destP << "\n";
1726  for(int i=0; i<kPartialsSize; i++)
1727  fprintf(stderr,"destP[%d] = %.5f\n",i,destP[i]);
1728  }
1729 
1730  // TODO None of the code below has been optimized.
1731  for (int k = 0; k < kPatternCount; k++) {
1732  REALTYPE max = 0;
1733  const int patternOffset = k * kPartialsPaddedStateCount;
1734  for (int l = 0; l < kCategoryCount; l++) {
1735  int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
1736  for (int i = 0; i < kStateCount; i++) {
1737  if(destP[offset] > max)
1738  max = destP[offset];
1739  offset++;
1740  }
1741  }
1742 
1743  if (max == 0)
1744  max = 1.0;
1745 
1746  REALTYPE oneOverMax = REALTYPE(1.0) / max;
1747  for (int l = 0; l < kCategoryCount; l++) {
1748  int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
1749  for (int i = 0; i < kStateCount; i++)
1750  destP[offset++] *= oneOverMax;
1751  }
1752 
1753  if (kFlags & BEAGLE_FLAG_SCALERS_LOG) {
1754  REALTYPE logMax = log(max);
1755  scaleFactors[k] = logMax;
1756  if( cumulativeScaleFactors != NULL )
1757  cumulativeScaleFactors[k] += logMax;
1758  } else {
1759  scaleFactors[k] = max;
1760  if( cumulativeScaleFactors != NULL )
1761  cumulativeScaleFactors[k] += log(max);
1762  }
1763  }
1764  if (DEBUGGING_OUTPUT) {
1765  for(int i=0; i<kPatternCount; i++)
1766  fprintf(stderr,"new scaleFactor[%d] = %.5f\n",i,scaleFactors[i]);
1767  }
1768 }
1769 
1770 BEAGLE_CPU_TEMPLATE
1771 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::autoRescalePartials(REALTYPE* destP,
1772  signed short* scaleFactors) {
1773 
1774 
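 // This variant records only the binary exponent of each pattern's maximum:
 // frexp(max, &expMax) defines max = m * 2^expMax with m in [0.5, 1), and
 // multiplying by 2^-expMax is exact (no mantissa bits change) barring
 // underflow. Illustrative example: frexp(3.0e-5, &e) yields e = -15, so the
 // pattern's partials are scaled up by 2^15 and -15 is stored in scaleFactors[k].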
1775  for (int k = 0; k < kPatternCount; k++) {
1776  REALTYPE max = 0;
1777  const int patternOffset = k * kPartialsPaddedStateCount;
1778  for (int l = 0; l < kCategoryCount; l++) {
1779  int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
1780  for (int i = 0; i < kStateCount; i++) {
1781  if(destP[offset] > max)
1782  max = destP[offset];
1783  offset++;
1784  }
1785  }
1786 
1787  int expMax;
1788  frexp(max, &expMax);
1789  scaleFactors[k] = expMax;
1790 
1791  if (expMax != 0) {
1792  for (int l = 0; l < kCategoryCount; l++) {
1793  int offset = l * kPaddedPatternCount * kPartialsPaddedStateCount + patternOffset;
1794  for (int i = 0; i < kStateCount; i++)
1795  destP[offset++] *= pow(2.0, -expMax);
1796  }
1797  }
1798  }
1799 }
1800 
1801 /*
1802  * Calculates partial likelihoods at a node when both children have states.
1803  */
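// With both children observed as states s1 and s2, the child sums collapse to
// single transition-matrix entries, so each destination partial is simply
//   destP[i] = M1(i, s1) * M2(i, s2)
// per rate category; no inner loop over child states is needed.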
1804 BEAGLE_CPU_TEMPLATE
1805 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesStates(REALTYPE* destP,
1806  const int* states1,
1807  const REALTYPE* matrices1,
1808  const int* states2,
1809  const REALTYPE* matrices2) {
1810 
1811 #pragma omp parallel for num_threads(kCategoryCount)
1812  for (int l = 0; l < kCategoryCount; l++) {
1813  int v = l*kPartialsPaddedStateCount*kPatternCount;
1814  for (int k = 0; k < kPatternCount; k++) {
1815  const int state1 = states1[k];
1816  const int state2 = states2[k];
1817  if (DEBUGGING_OUTPUT) {
1818  std::cerr << "calcStatesStates s1 = " << state1 << '\n';
1819  std::cerr << "calcStatesStates s2 = " << state2 << '\n';
1820  }
1821  int w = l * kMatrixSize;
1822  for (int i = 0; i < kStateCount; i++) {
1823  destP[v] = matrices1[w + state1] * matrices2[w + state2];
1824  v++;
1825 
1826  w += kTransPaddedStateCount;
1827  }
1828  v += P_PAD;
1829  }
1830  }
1831 }
1832 
1833 BEAGLE_CPU_TEMPLATE
1834 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesStatesFixedScaling(REALTYPE* destP,
1835  const int* child1States,
1836  const REALTYPE* child1TransMat,
1837  const int* child2States,
1838  const REALTYPE* child2TransMat,
1839  const REALTYPE* scaleFactors) {
1840 #pragma omp parallel for num_threads(kCategoryCount)
1841  for (int l = 0; l < kCategoryCount; l++) {
1842  int v = l*kPartialsPaddedStateCount*kPatternCount;
1843  for (int k = 0; k < kPatternCount; k++) {
1844  const int state1 = child1States[k];
1845  const int state2 = child2States[k];
1846  int w = l * kMatrixSize;
1847  REALTYPE scaleFactor = scaleFactors[k];
1848  for (int i = 0; i < kStateCount; i++) {
1849  destP[v] = child1TransMat[w + state1] *
1850  child2TransMat[w + state2] / scaleFactor;
1851  v++;
1852 
1853  w += kTransPaddedStateCount;
1854  }
1855  v += P_PAD;
1856  }
1857  }
1858 }
1859 
1860 /*
1861  * Calculates partial likelihoods at a node when one child has states and one has partials.
1862  */
1863 BEAGLE_CPU_TEMPLATE
1864 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesPartials(REALTYPE* destP,
1865  const int* states1,
1866  const REALTYPE* matrices1,
1867  const REALTYPE* partials2,
1868  const REALTYPE* matrices2) {
1869  int matrixIncr = kStateCount;
1870 
1871  // increment for the extra column at the end
1872  matrixIncr += T_PAD;
1873 
1874 
1875  int stateCountModFour = (kStateCount / 4) * 4;
1876 
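 // The dot products below are unrolled four-wide with two independent
 // accumulators (sumA, sumB) so successive multiply-adds do not form a
 // single serial dependency chain; stateCountModFour marks the hand-off
 // to the remainder loop when kStateCount is not a multiple of four.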
1877 #pragma omp parallel for num_threads(kCategoryCount)
1878  for (int l = 0; l < kCategoryCount; l++) {
1879  int v = l*kPartialsPaddedStateCount*kPatternCount;
1880  int matrixOffset = l*kMatrixSize;
1881  const REALTYPE* partials2Ptr = &partials2[v];
1882  REALTYPE* destPtr = &destP[v];
1883  for (int k = 0; k < kPatternCount; k++) {
1884  int w = l * kMatrixSize;
1885  int state1 = states1[k];
1886  for (int i = 0; i < kStateCount; i++) {
1887  const REALTYPE* matrices2Ptr = matrices2 + matrixOffset + i * matrixIncr;
1888  REALTYPE tmp = matrices1[w + state1];
1889  REALTYPE sumA = 0.0;
1890  REALTYPE sumB = 0.0;
1891  int j = 0;
1892  for (; j < stateCountModFour; j += 4) {
1893  sumA += matrices2Ptr[j + 0] * partials2Ptr[j + 0];
1894  sumB += matrices2Ptr[j + 1] * partials2Ptr[j + 1];
1895  sumA += matrices2Ptr[j + 2] * partials2Ptr[j + 2];
1896  sumB += matrices2Ptr[j + 3] * partials2Ptr[j + 3];
1897  }
1898  for (; j < kStateCount; j++) {
1899  sumA += matrices2Ptr[j] * partials2Ptr[j];
1900  }
1901 
1902  w += matrixIncr;
1903 
1904  *(destPtr++) = tmp * (sumA + sumB);
1905  }
1906  destPtr += P_PAD;
1907  partials2Ptr += kPartialsPaddedStateCount;
1908  }
1909  }
1910 }
1911 
1912 BEAGLE_CPU_TEMPLATE
1913 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcStatesPartialsFixedScaling(REALTYPE* destP,
1914  const int* states1,
1915  const REALTYPE* matrices1,
1916  const REALTYPE* partials2,
1917  const REALTYPE* matrices2,
1918  const REALTYPE* scaleFactors) {
1919  int matrixIncr = kStateCount;
1920 
1921  // increment for the extra column at the end
1922  matrixIncr += T_PAD;
1923 
1924  int stateCountModFour = (kStateCount / 4) * 4;
1925 
1926 #pragma omp parallel for num_threads(kCategoryCount)
1927  for (int l = 0; l < kCategoryCount; l++) {
1928  int v = l*kPartialsPaddedStateCount*kPatternCount;
1929  int matrixOffset = l*kMatrixSize;
1930  const REALTYPE* partials2Ptr = &partials2[v];
1931  REALTYPE* destPtr = &destP[v];
1932  for (int k = 0; k < kPatternCount; k++) {
1933  int w = l * kMatrixSize;
1934  int state1 = states1[k];
1935  REALTYPE oneOverScaleFactor = REALTYPE(1.0) / scaleFactors[k];
1936  for (int i = 0; i < kStateCount; i++) {
1937  const REALTYPE* matrices2Ptr = matrices2 + matrixOffset + i * matrixIncr;
1938  REALTYPE tmp = matrices1[w + state1];
1939  REALTYPE sumA = 0.0;
1940  REALTYPE sumB = 0.0;
1941  int j = 0;
1942  for (; j < stateCountModFour; j += 4) {
1943  sumA += matrices2Ptr[j + 0] * partials2Ptr[j + 0];
1944  sumB += matrices2Ptr[j + 1] * partials2Ptr[j + 1];
1945  sumA += matrices2Ptr[j + 2] * partials2Ptr[j + 2];
1946  sumB += matrices2Ptr[j + 3] * partials2Ptr[j + 3];
1947  }
1948  for (; j < kStateCount; j++) {
1949  sumA += matrices2Ptr[j] * partials2Ptr[j];
1950  }
1951 
1952  w += matrixIncr;
1953 
1954  *(destPtr++) = tmp * (sumA + sumB) * oneOverScaleFactor;
1955  }
1956  destPtr += P_PAD;
1957  partials2Ptr += kPartialsPaddedStateCount;
1958  }
1959  }
1960 }
1961 
1962 /*
1963  * Calculates partial likelihoods at a node when both children have partials.
1964  */
1965 BEAGLE_CPU_TEMPLATE
1966 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcPartialsPartials(REALTYPE* destP,
1967  const REALTYPE* partials1,
1968  const REALTYPE* matrices1,
1969  const REALTYPE* partials2,
1970  const REALTYPE* matrices2) {
1971  int matrixIncr = kStateCount;
1972 
1973  // increment for the extra column at the end
1974  matrixIncr += T_PAD;
1975 
1976  int stateCountModFour = (kStateCount / 4) * 4;
1977 
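 // Same four-wide, two-accumulator unrolling as calcStatesPartials, here
 // applied to both children's partials at once.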
1978 #pragma omp parallel for num_threads(kCategoryCount)
1979  for (int l = 0; l < kCategoryCount; l++) {
1980  int v = l*kPartialsPaddedStateCount*kPatternCount;
1981  int matrixOffset = l*kMatrixSize;
1982  const REALTYPE* partials1Ptr = &partials1[v];
1983  const REALTYPE* partials2Ptr = &partials2[v];
1984  REALTYPE* destPtr = &destP[v];
1985  for (int k = 0; k < kPatternCount; k++) {
1986 
1987  for (int i = 0; i < kStateCount; i++) {
1988  const REALTYPE* matrices1Ptr = matrices1 + matrixOffset + i * matrixIncr;
1989  const REALTYPE* matrices2Ptr = matrices2 + matrixOffset + i * matrixIncr;
1990  REALTYPE sum1A = 0.0, sum2A = 0.0;
1991  REALTYPE sum1B = 0.0, sum2B = 0.0;
1992  int j = 0;
1993  for (; j < stateCountModFour; j += 4) {
1994  sum1A += matrices1Ptr[j + 0] * partials1Ptr[j + 0];
1995  sum2A += matrices2Ptr[j + 0] * partials2Ptr[j + 0];
1996 
1997  sum1B += matrices1Ptr[j + 1] * partials1Ptr[j + 1];
1998  sum2B += matrices2Ptr[j + 1] * partials2Ptr[j + 1];
1999 
2000  sum1A += matrices1Ptr[j + 2] * partials1Ptr[j + 2];
2001  sum2A += matrices2Ptr[j + 2] * partials2Ptr[j + 2];
2002 
2003  sum1B += matrices1Ptr[j + 3] * partials1Ptr[j + 3];
2004  sum2B += matrices2Ptr[j + 3] * partials2Ptr[j + 3];
2005  }
2006 
2007  for (; j < kStateCount; j++) {
2008  sum1A += matrices1Ptr[j] * partials1Ptr[j];
2009  sum2A += matrices2Ptr[j] * partials2Ptr[j];
2010  }
2011 
2012  *(destPtr++) = (sum1A + sum1B) * (sum2A + sum2B);
2013  }
2014  destPtr += P_PAD;
2015  partials1Ptr += kPartialsPaddedStateCount;
2016  partials2Ptr += kPartialsPaddedStateCount;
2017  }
2018  }
2019 }
2020 
2021 BEAGLE_CPU_TEMPLATE
2022 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcPartialsPartialsFixedScaling(REALTYPE* destP,
2023  const REALTYPE* partials1,
2024  const REALTYPE* matrices1,
2025  const REALTYPE* partials2,
2026  const REALTYPE* matrices2,
2027  const REALTYPE* scaleFactors) {
2028  int matrixIncr = kStateCount;
2029 
2030  // increment for the extra column at the end
2031  matrixIncr += T_PAD;
2032 
2033  int stateCountModFour = (kStateCount / 4) * 4;
2034 
2035 #pragma omp parallel for num_threads(kCategoryCount)
2036  for (int l = 0; l < kCategoryCount; l++) {
2037  int v = l*kPartialsPaddedStateCount*kPatternCount;
2038  int matrixOffset = l*kMatrixSize;
2039  const REALTYPE* partials1Ptr = &partials1[v];
2040  const REALTYPE* partials2Ptr = &partials2[v];
2041  REALTYPE* destPtr = &destP[v];
2042  for (int k = 0; k < kPatternCount; k++) {
2043  REALTYPE oneOverScaleFactor = REALTYPE(1.0) / scaleFactors[k];
2044  for (int i = 0; i < kStateCount; i++) {
2045  const REALTYPE* matrices1Ptr = matrices1 + matrixOffset + i * matrixIncr;
2046  const REALTYPE* matrices2Ptr = matrices2 + matrixOffset + i * matrixIncr;
2047  REALTYPE sum1A = 0.0, sum2A = 0.0;
2048  REALTYPE sum1B = 0.0, sum2B = 0.0;
2049  int j = 0;
2050  for (; j < stateCountModFour; j += 4) {
2051  sum1A += matrices1Ptr[j + 0] * partials1Ptr[j + 0];
2052  sum2A += matrices2Ptr[j + 0] * partials2Ptr[j + 0];
2053 
2054  sum1B += matrices1Ptr[j + 1] * partials1Ptr[j + 1];
2055  sum2B += matrices2Ptr[j + 1] * partials2Ptr[j + 1];
2056 
2057  sum1A += matrices1Ptr[j + 2] * partials1Ptr[j + 2];
2058  sum2A += matrices2Ptr[j + 2] * partials2Ptr[j + 2];
2059 
2060  sum1B += matrices1Ptr[j + 3] * partials1Ptr[j + 3];
2061  sum2B += matrices2Ptr[j + 3] * partials2Ptr[j + 3];
2062  }
2063 
2064  for (; j < kStateCount; j++) {
2065  sum1A += matrices1Ptr[j] * partials1Ptr[j];
2066  sum2A += matrices2Ptr[j] * partials2Ptr[j];
2067  }
2068 
2069  *(destPtr++) = (sum1A + sum1B) * (sum2A + sum2B) * oneOverScaleFactor;
2070  }
2071  destPtr += P_PAD;
2072  partials1Ptr += kPartialsPaddedStateCount;
2073  partials2Ptr += kPartialsPaddedStateCount;
2074  }
2075  }
2076 }
2077 
2078 BEAGLE_CPU_TEMPLATE
2079 void BeagleCPUImpl<BEAGLE_CPU_GENERIC>::calcPartialsPartialsAutoScaling(REALTYPE* destP,
2080  const REALTYPE* partials1,
2081  const REALTYPE* matrices1,
2082  const REALTYPE* partials2,
2083  const REALTYPE* matrices2,
2084  int* activateScaling) {
2085 
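 // As each destination partial is produced, its binary exponent is checked;
 // once any exponent's magnitude exceeds scalingExponentThreshhold, the
 // *activateScaling flag is raised so the caller knows rescaling is needed.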
2086 #pragma omp parallel for num_threads(kCategoryCount)
2087  for (int l = 0; l < kCategoryCount; l++) {
2088  int u = l*kPartialsPaddedStateCount*kPatternCount;
2089  int v = l*kPartialsPaddedStateCount*kPatternCount;
2090  for (int k = 0; k < kPatternCount; k++) {
2091  int w = l * kMatrixSize;
2092  for (int i = 0; i < kStateCount; i++) {
2093  REALTYPE sum1 = 0.0, sum2 = 0.0;
2094  for (int j = 0; j < kStateCount; j++) {
2095  sum1 += matrices1[w] * partials1[v + j];
2096  sum2 += matrices2[w] * partials2[v + j];
2097  w++;
2098  }
2099 
2100  // increment for the extra column at the end
2101  w += T_PAD;
2102 
2103  destP[u] = sum1 * sum2;
2104 
2105  if (*activateScaling == 0) {
2106  int expTmp;
2107  frexp(destP[u], &expTmp);
2108  if (abs(expTmp) > scalingExponentThreshhold)
2109  *activateScaling = 1;
2110  }
2111 
2112  u++;
2113  }
2114  u += P_PAD;
2115  v += kPartialsPaddedStateCount;
2116  }
2117  }
2118 }
2119 
2120 BEAGLE_CPU_TEMPLATE
2121 int BeagleCPUImpl<BEAGLE_CPU_GENERIC>::getPaddedPatternsModulus() {
2122  // Padding is only necessary for SSE implementations that vectorize across patterns
2123  return 1; // No padding
2124 }
2125 
2126 BEAGLE_CPU_TEMPLATE
2127 void* BeagleCPUImpl<BEAGLE_CPU_GENERIC>::mallocAligned(size_t size) {
2128  void *ptr = (void *) NULL;
2129 
2130 #if defined (__APPLE__) || defined(WIN32)
2131  /*
2132  malloc on OS X always returns a 16-byte aligned pointer,
2133  and Windows malloc() likewise gives 16-byte alignment,
2134  so no extra alignment handling is needed on these platforms.
2135  */
2136  assert(size > 0);
2137  ptr = malloc(size);
2138  if(ptr == (void*)NULL) {
2139  assert(0);
2140  }
2141 #else
2142  #if (T_PAD == 1)
2143  const size_t align = 32;
2144  #else // T_PAD == 2
2145  const size_t align = 16;
2146  #endif
2147  int res;
2148  res = posix_memalign(&ptr, align, size);
2149  if (res != 0) {
2150  assert(0);
2151  }
2152 #endif
2153 
2154  return ptr;
2155 }
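// Illustrative use (a sketch, not a call made at this point in the file):
// a partials buffer would be obtained as
//   REALTYPE* buf = (REALTYPE*) mallocAligned(kPartialsSize * sizeof(REALTYPE));
// so that vector loads over the padded layout see aligned addresses.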
2156 
2157 ///////////////////////////////////////////////////////////////////////////
2158 // BeagleCPUImplFactory public methods
2159 BEAGLE_CPU_FACTORY_TEMPLATE
2160 BeagleImpl* BeagleCPUImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::createImpl(int tipCount,
2161  int partialsBufferCount,
2162  int compactBufferCount,
2163  int stateCount,
2164  int patternCount,
2165  int eigenBufferCount,
2166  int matrixBufferCount,
2167  int categoryCount,
2168  int scaleBufferCount,
2169  int resourceNumber,
2170  long preferenceFlags,
2171  long requirementFlags,
2172  int* errorCode) {
2173 
2174  BeagleImpl* impl = new BeagleCPUImpl<REALTYPE, T_PAD_DEFAULT, P_PAD_DEFAULT>();
2175 
2176  try {
2177  *errorCode =
2178  impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount,
2179  patternCount, eigenBufferCount, matrixBufferCount,
2180  categoryCount,scaleBufferCount, resourceNumber, preferenceFlags, requirementFlags);
2181  if (*errorCode == BEAGLE_SUCCESS) {
2182  return impl;
2183  }
2184  delete impl;
2185  return NULL;
2186  }
2187  catch(...) {
2188  if (DEBUGGING_OUTPUT)
2189  std::cerr << "exception in initialize\n";
2190  delete impl;
2191  throw;
2192  }
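 // Note: control cannot reach this point -- the try block always returns
 // and the catch rethrows -- so the statements below are a defensive
 // fallback only.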
2193 
2194  delete impl;
2195 
2196  return NULL;
2197 }
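// Sketch of factory use (hypothetical values, not code from this file):
//   int err = BEAGLE_SUCCESS;
//   BeagleImpl* inst = factory.createImpl(4 /*tips*/, 8, 4, 4 /*states*/,
//                                         100 /*patterns*/, 2, 8, 4, 1,
//                                         0 /*resource*/, 0 /*prefs*/,
//                                         0 /*reqs*/, &err);
//   if (inst == NULL) { /* inspect err */ }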
2198 
2199 
2200 BEAGLE_CPU_FACTORY_TEMPLATE
2201 const char* BeagleCPUImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::getName() {
2202  return getBeagleCPUName<BEAGLE_CPU_FACTORY_GENERIC>();
2203 }
2204 
2205 BEAGLE_CPU_FACTORY_TEMPLATE
2206 const long BeagleCPUImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::getFlags() {
2207  long flags = BEAGLE_FLAG_COMPUTATION_SYNCH |
2208  BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO | BEAGLE_FLAG_SCALING_DYNAMIC |
2209  BEAGLE_FLAG_THREADING_NONE |
2210  BEAGLE_FLAG_PROCESSOR_CPU |
2211  BEAGLE_FLAG_VECTOR_NONE |
2212  BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW |
2213  BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL |
2214  BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED;
2215  if (DOUBLE_PRECISION)
2216  flags |= BEAGLE_FLAG_PRECISION_DOUBLE;
2217  else
2218  flags |= BEAGLE_FLAG_PRECISION_SINGLE;
2219  return flags;
2220 }
2221 
2222 } // namespace cpu
2223 } // namespace beagle
2224 
2225 #endif // BEAGLE_CPU_IMPL_GENERAL_HPP
2226