/*
 * BeagleCPU4StateImpl.hpp
 * BEAGLE
 *
 * Copyright 2009 Phylogenetic Likelihood Working Group
 *
 * This file is part of BEAGLE.
 *
 * BEAGLE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * BEAGLE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with BEAGLE. If not, see
 * <http://www.gnu.org/licenses/>.
 *
 * @author Andrew Rambaut
 * @author Marc Suchard
 * @author Daniel Ayres
 * @author Mark Holder
 * @author Aaron Darling
 */

#ifndef BEAGLE_CPU_4STATE_IMPL_HPP
#define BEAGLE_CPU_4STATE_IMPL_HPP

#ifdef HAVE_CONFIG_H
#include "libhmsbeagle/config.h"
#endif

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <cstring>
#include <cmath>
#include <cassert>

#include "libhmsbeagle/beagle.h"
#include "libhmsbeagle/CPU/BeagleCPUImpl.h"
#include "libhmsbeagle/CPU/BeagleCPU4StateImpl.h"

#define EXPERIMENTAL_OPENMP


#define OFFSET (4 + T_PAD) // For easy conversion between 4/5
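// OFFSET is the row stride of one 4x4 transition matrix inside a 'matrices'
// buffer: entry (i,j) for rate category l lives at
//     matrices[l*4*OFFSET + OFFSET*i + j].
// With T_PAD > 0 each row carries extra padding column(s), presumably reserved
// for the ambiguous/unknown state, hence the "conversion between 4/5" note.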

#define PREFETCH_MATRIX(num,matrices,w) \
    REALTYPE m##num##00, m##num##01, m##num##02, m##num##03, \
             m##num##10, m##num##11, m##num##12, m##num##13, \
             m##num##20, m##num##21, m##num##22, m##num##23, \
             m##num##30, m##num##31, m##num##32, m##num##33; \
    m##num##00 = matrices[w + OFFSET*0 + 0]; \
    m##num##01 = matrices[w + OFFSET*0 + 1]; \
    m##num##02 = matrices[w + OFFSET*0 + 2]; \
    m##num##03 = matrices[w + OFFSET*0 + 3]; \
    m##num##10 = matrices[w + OFFSET*1 + 0]; \
    m##num##11 = matrices[w + OFFSET*1 + 1]; \
    m##num##12 = matrices[w + OFFSET*1 + 2]; \
    m##num##13 = matrices[w + OFFSET*1 + 3]; \
    m##num##20 = matrices[w + OFFSET*2 + 0]; \
    m##num##21 = matrices[w + OFFSET*2 + 1]; \
    m##num##22 = matrices[w + OFFSET*2 + 2]; \
    m##num##23 = matrices[w + OFFSET*2 + 3]; \
    m##num##30 = matrices[w + OFFSET*3 + 0]; \
    m##num##31 = matrices[w + OFFSET*3 + 1]; \
    m##num##32 = matrices[w + OFFSET*3 + 2]; \
    m##num##33 = matrices[w + OFFSET*3 + 3];

#define PREFETCH_PARTIALS(num,partials,v) \
    REALTYPE p##num##0, p##num##1, p##num##2, p##num##3; \
    p##num##0 = partials[v + 0]; \
    p##num##1 = partials[v + 1]; \
    p##num##2 = partials[v + 2]; \
    p##num##3 = partials[v + 3];

//#define DO_INTEGRATION(num) \
//    REALTYPE sum##num##0, sum##num##1, sum##num##2, sum##num##3; \
//    sum##num##0 = m##num##00 * p##num##0; \
//    sum##num##1 = m##num##10 * p##num##0; \
//    sum##num##2 = m##num##20 * p##num##0; \
//    sum##num##3 = m##num##30 * p##num##0; \
//    \
//    sum##num##0 += m##num##01 * p##num##1; \
//    sum##num##1 += m##num##11 * p##num##1; \
//    sum##num##2 += m##num##21 * p##num##1; \
//    sum##num##3 += m##num##31 * p##num##1; \
//    \
//    sum##num##0 += m##num##02 * p##num##2; \
//    sum##num##1 += m##num##12 * p##num##2; \
//    sum##num##2 += m##num##22 * p##num##2; \
//    sum##num##3 += m##num##32 * p##num##2; \
//    \
//    sum##num##0 += m##num##03 * p##num##3; \
//    sum##num##1 += m##num##13 * p##num##3; \
//    sum##num##2 += m##num##23 * p##num##3; \
//    sum##num##3 += m##num##33 * p##num##3;

#define DO_INTEGRATION(num) \
    REALTYPE sum##num##0, sum##num##1, sum##num##2, sum##num##3; \
    sum##num##0 = m##num##00 * p##num##0 + \
                  m##num##01 * p##num##1 + \
                  m##num##02 * p##num##2 + \
                  m##num##03 * p##num##3; \
    \
    sum##num##1 = m##num##10 * p##num##0 + \
                  m##num##11 * p##num##1 + \
                  m##num##12 * p##num##2 + \
                  m##num##13 * p##num##3; \
    \
    sum##num##2 = m##num##20 * p##num##0 + \
                  m##num##21 * p##num##1 + \
                  m##num##22 * p##num##2 + \
                  m##num##23 * p##num##3; \
    \
    sum##num##3 = m##num##30 * p##num##0 + \
                  m##num##31 * p##num##1 + \
                  m##num##32 * p##num##2 + \
                  m##num##33 * p##num##3;

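// Taken together, these macros unroll one 4x4 matrix-vector product per child
// branch: PREFETCH_MATRIX caches the matrix entries m<num>ij in locals,
// PREFETCH_PARTIALS caches the four child partials p<num>j for the current
// pattern, and DO_INTEGRATION computes
//     sum<num>i = sum_j m<num>ij * p<num>j,   i = 0..3,
// i.e. the transition-weighted sum over child states j for each parent state i.
// The commented-out variant above accumulates the same sums column-by-column;
// the active version groups the work row-by-row.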

namespace beagle {
namespace cpu {

BEAGLE_CPU_FACTORY_TEMPLATE
inline const char* getBeagleCPU4StateName(){ return "CPU-4State-Unknown"; }

template<>
inline const char* getBeagleCPU4StateName<double>(){ return "CPU-4State-Double"; }

template<>
inline const char* getBeagleCPU4StateName<float>(){ return "CPU-4State-Single"; }

BEAGLE_CPU_TEMPLATE
BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::~BeagleCPU4StateImpl() {
    // free all that stuff...
    // If you delete partials, make sure not to delete the last element
    // which is TEMP_SCRATCH_PARTIAL twice.
}

// private methods

/*
 * Calculates partial likelihoods at a node when both children have states.
 */
BEAGLE_CPU_TEMPLATE
void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcStatesStates(REALTYPE* destP,
                                                               const int* states1,
                                                               const REALTYPE* matrices1,
                                                               const int* states2,
                                                               const REALTYPE* matrices2) {

#pragma omp parallel for num_threads(kCategoryCount)
    for (int l = 0; l < kCategoryCount; l++) {
        int v = l*4*kPaddedPatternCount;
        int w = l*4*OFFSET;

        for (int k = 0; k < kPatternCount; k++) {

            const int state1 = states1[k];
            const int state2 = states2[k];

            destP[v    ] = matrices1[w            + state1] *
                           matrices2[w            + state2];
            destP[v + 1] = matrices1[w + OFFSET*1 + state1] *
                           matrices2[w + OFFSET*1 + state2];
            destP[v + 2] = matrices1[w + OFFSET*2 + state1] *
                           matrices2[w + OFFSET*2 + state2];
            destP[v + 3] = matrices1[w + OFFSET*3 + state1] *
                           matrices2[w + OFFSET*3 + state2];
            v += 4;
        }
    }
}
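
// Layout note: destP is indexed as [category][pattern][state], so the partial
// for state i of pattern k in category l sits at l*4*kPaddedPatternCount + 4*k + i.
// When both children are observed states, no summation is needed: each entry is
// just the product of the two matrix entries P1(i, state1) * P2(i, state2).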

BEAGLE_CPU_TEMPLATE
void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcStatesStatesFixedScaling(REALTYPE* destP,
                                                                           const int* states1,
                                                                           const REALTYPE* matrices1,
                                                                           const int* states2,
                                                                           const REALTYPE* matrices2,
                                                                           const REALTYPE* scaleFactors) {

#pragma omp parallel for num_threads(kCategoryCount)
    for (int l = 0; l < kCategoryCount; l++) {
        int v = l*4*kPaddedPatternCount;
        int w = l*4*OFFSET;

        for (int k = 0; k < kPatternCount; k++) {

            const int state1 = states1[k];
            const int state2 = states2[k];
            const REALTYPE scaleFactor = scaleFactors[k];

            destP[v    ] = matrices1[w            + state1] *
                           matrices2[w            + state2] / scaleFactor;
            destP[v + 1] = matrices1[w + OFFSET*1 + state1] *
                           matrices2[w + OFFSET*1 + state2] / scaleFactor;
            destP[v + 2] = matrices1[w + OFFSET*2 + state1] *
                           matrices2[w + OFFSET*2 + state2] / scaleFactor;
            destP[v + 3] = matrices1[w + OFFSET*3 + state1] *
                           matrices2[w + OFFSET*3 + state2] / scaleFactor;
            v += 4;
        }
    }
}

/*
 * Calculates partial likelihoods at a node when one child has states and one has partials.
 */
BEAGLE_CPU_TEMPLATE
void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcStatesPartials(REALTYPE* destP,
                                                                 const int* states1,
                                                                 const REALTYPE* matrices1,
                                                                 const REALTYPE* partials2,
                                                                 const REALTYPE* matrices2) {

#pragma omp parallel for num_threads(kCategoryCount)
    for (int l = 0; l < kCategoryCount; l++) {
        int u = l*4*kPaddedPatternCount;
        int w = l*4*OFFSET;

        PREFETCH_MATRIX(2,matrices2,w);

        for (int k = 0; k < kPatternCount; k++) {

            const int state1 = states1[k];

            PREFETCH_PARTIALS(2,partials2,u);

            DO_INTEGRATION(2); // defines sum20, sum21, sum22, sum23

            destP[u    ] = matrices1[w            + state1] * sum20;
            destP[u + 1] = matrices1[w + OFFSET*1 + state1] * sum21;
            destP[u + 2] = matrices1[w + OFFSET*2 + state1] * sum22;
            destP[u + 3] = matrices1[w + OFFSET*3 + state1] * sum23;

            u += 4;
        }
    }
}
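
// With one observed child and one partials child, each destination entry is
//     destP[i] = P1(i, state1) * sum_j P2(i, j) * partials2[j],
// so the summation (DO_INTEGRATION) is only required for the child that
// carries partials.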

BEAGLE_CPU_TEMPLATE
void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcStatesPartialsFixedScaling(REALTYPE* destP,
                                                                             const int* states1,
                                                                             const REALTYPE* matrices1,
                                                                             const REALTYPE* partials2,
                                                                             const REALTYPE* matrices2,
                                                                             const REALTYPE* scaleFactors) {

#pragma omp parallel for num_threads(kCategoryCount)
    for (int l = 0; l < kCategoryCount; l++) {
        int u = l*4*kPaddedPatternCount;
        int w = l*4*OFFSET;

        PREFETCH_MATRIX(2,matrices2,w);

        for (int k = 0; k < kPatternCount; k++) {

            const int state1 = states1[k];
            const REALTYPE scaleFactor = scaleFactors[k];

            PREFETCH_PARTIALS(2,partials2,u);

            DO_INTEGRATION(2); // defines sum20, sum21, sum22, sum23

            destP[u    ] = matrices1[w            + state1] * sum20 / scaleFactor;
            destP[u + 1] = matrices1[w + OFFSET*1 + state1] * sum21 / scaleFactor;
            destP[u + 2] = matrices1[w + OFFSET*2 + state1] * sum22 / scaleFactor;
            destP[u + 3] = matrices1[w + OFFSET*3 + state1] * sum23 / scaleFactor;

            u += 4;
        }
    }
}

BEAGLE_CPU_TEMPLATE
void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcPartialsPartials(REALTYPE* destP,
                                                                   const REALTYPE* partials1,
                                                                   const REALTYPE* matrices1,
                                                                   const REALTYPE* partials2,
                                                                   const REALTYPE* matrices2) {

#pragma omp parallel for num_threads(kCategoryCount)
    for (int l = 0; l < kCategoryCount; l++) {
        int u = l*4*kPaddedPatternCount;
        int w = l*4*OFFSET;

        PREFETCH_MATRIX(1,matrices1,w);
        PREFETCH_MATRIX(2,matrices2,w);

        for (int k = 0; k < kPatternCount; k++) {
            PREFETCH_PARTIALS(1,partials1,u);
            PREFETCH_PARTIALS(2,partials2,u);

            DO_INTEGRATION(1); // defines sum10, sum11, sum12, sum13
            DO_INTEGRATION(2); // defines sum20, sum21, sum22, sum23

            // Final results
            destP[u    ] = sum10 * sum20;
            destP[u + 1] = sum11 * sum21;
            destP[u + 2] = sum12 * sum22;
            destP[u + 3] = sum13 * sum23;

            u += 4;
        }
    }
}
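
// This is the standard pruning step for two partials children:
//     destP[i] = (sum_j P1(i, j) * partials1[j]) * (sum_j P2(i, j) * partials2[j]),
// evaluated independently for every pattern and rate category.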

BEAGLE_CPU_TEMPLATE
void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcPartialsPartialsAutoScaling(REALTYPE* destP,
                                                                              const REALTYPE* partials1,
                                                                              const REALTYPE* matrices1,
                                                                              const REALTYPE* partials2,
                                                                              const REALTYPE* matrices2,
                                                                              int* activateScaling) {

#pragma omp parallel for num_threads(kCategoryCount)
    for (int l = 0; l < kCategoryCount; l++) {
        int u = l*4*kPaddedPatternCount;
        int w = l*4*OFFSET;

        PREFETCH_MATRIX(1,matrices1,w);
        PREFETCH_MATRIX(2,matrices2,w);

        for (int k = 0; k < kPatternCount; k++) {
            PREFETCH_PARTIALS(1,partials1,u);
            PREFETCH_PARTIALS(2,partials2,u);

            DO_INTEGRATION(1); // defines sum10, sum11, sum12, sum13
            DO_INTEGRATION(2); // defines sum20, sum21, sum22, sum23

            // Final results
            destP[u    ] = sum10 * sum20;
            destP[u + 1] = sum11 * sum21;
            destP[u + 2] = sum12 * sum22;
            destP[u + 3] = sum13 * sum23;

            if (*activateScaling == 0) {
                int expTmp;
                int expMax;
                frexp(destP[u], &expMax);
                frexp(destP[u + 1], &expTmp);
                if (abs(expTmp) > abs(expMax))
                    expMax = expTmp;
                frexp(destP[u + 2], &expTmp);
                if (abs(expTmp) > abs(expMax))
                    expMax = expTmp;
                frexp(destP[u + 3], &expTmp);
                if (abs(expTmp) > abs(expMax))
                    expMax = expTmp;

                if (abs(expMax) > scalingExponentThreshhold) {
                    *activateScaling = 1;
                }
            }

            u += 4;
        }
    }
}
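
// Auto-scaling probe: frexp() extracts the base-2 exponent of each freshly
// computed partial. If the largest exponent magnitude for any pattern exceeds
// scalingExponentThreshhold, *activateScaling is set to 1, presumably so the
// caller can switch scaling on for subsequent operations; until then the
// partials are written unscaled, exactly as in calcPartialsPartials().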

BEAGLE_CPU_TEMPLATE
void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcPartialsPartialsFixedScaling(REALTYPE* destP,
                                                                               const REALTYPE* partials1,
                                                                               const REALTYPE* matrices1,
                                                                               const REALTYPE* partials2,
                                                                               const REALTYPE* matrices2,
                                                                               const REALTYPE* scaleFactors) {

#pragma omp parallel for num_threads(kCategoryCount)
    for (int l = 0; l < kCategoryCount; l++) {
        int u = l*4*kPaddedPatternCount;
        int w = l*4*OFFSET;

        PREFETCH_MATRIX(1,matrices1,w);
        PREFETCH_MATRIX(2,matrices2,w);

        for (int k = 0; k < kPatternCount; k++) {

            // Prefetch scale factor
            const REALTYPE scaleFactor = scaleFactors[k];

            PREFETCH_PARTIALS(1,partials1,u);
            PREFETCH_PARTIALS(2,partials2,u);

            DO_INTEGRATION(1); // defines sum10, sum11, sum12, sum13
            DO_INTEGRATION(2); // defines sum20, sum21, sum22, sum23

            // Final results
            destP[u    ] = sum10 * sum20 / scaleFactor;
            destP[u + 1] = sum11 * sum21 / scaleFactor;
            destP[u + 2] = sum12 * sum22 / scaleFactor;
            destP[u + 3] = sum13 * sum23 / scaleFactor;

            u += 4;
        }
    }
}

BEAGLE_CPU_TEMPLATE
int inline BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::integrateOutStatesAndScale(const REALTYPE* integrationTmp,
                                                                               const int stateFrequenciesIndex,
                                                                               const int scalingFactorsIndex,
                                                                               double* outSumLogLikelihood) {

    int returnCode = BEAGLE_SUCCESS;

    register REALTYPE freq0, freq1, freq2, freq3; // Is it a good idea to specify 'register'?
    freq0 = gStateFrequencies[stateFrequenciesIndex][0];
    freq1 = gStateFrequencies[stateFrequenciesIndex][1];
    freq2 = gStateFrequencies[stateFrequenciesIndex][2];
    freq3 = gStateFrequencies[stateFrequenciesIndex][3];

    int u = 0;
    for (int k = 0; k < kPatternCount; k++) {
        REALTYPE sumOverI =
            freq0 * integrationTmp[u    ] +
            freq1 * integrationTmp[u + 1] +
            freq2 * integrationTmp[u + 2] +
            freq3 * integrationTmp[u + 3];

        u += 4;

        outLogLikelihoodsTmp[k] = log(sumOverI);
    }

    if (scalingFactorsIndex != BEAGLE_OP_NONE) {
        const REALTYPE* scalingFactors = gScaleBuffers[scalingFactorsIndex];
        for (int k = 0; k < kPatternCount; k++) {
            outLogLikelihoodsTmp[k] += scalingFactors[k];
        }
    }

    *outSumLogLikelihood = 0.0;
    for (int k = 0; k < kPatternCount; k++) {
        *outSumLogLikelihood += outLogLikelihoodsTmp[k] * gPatternWeights[k];
    }

    if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
        returnCode = BEAGLE_ERROR_FLOATING_POINT;

    return returnCode;
}
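
// integrateOutStatesAndScale() folds the category-summed values in
// integrationTmp against the state frequencies to obtain a per-pattern site
// likelihood, takes logs, adds the per-pattern factors from the cumulative
// scale buffer (accumulated in log space by the scaling code), and returns the
// pattern-weighted sum. The test
//     !(*outSumLogLikelihood - *outSumLogLikelihood == 0.0)
// is an idiom for detecting NaN or infinity: x - x equals 0 only for finite x.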

#define FAST_MAX(x,y) (x > y ? x : y)
//#define BEAGLE_TEST_OPTIMIZATION
/*
 * Re-scales the partial likelihoods such that the largest is one.
 */
BEAGLE_CPU_TEMPLATE
void BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::rescalePartials(REALTYPE* destP,
                                                              REALTYPE* scaleFactors,
                                                              REALTYPE* cumulativeScaleFactors,
                                                              const int fillWithOnes) {

    bool useLogScalars = kFlags & BEAGLE_FLAG_SCALERS_LOG;

    for (int k = 0; k < kPatternCount; k++) {
        REALTYPE max = 0;
        const int patternOffset = k * 4;
        for (int l = 0; l < kCategoryCount; l++) {
            int offset = l * kPaddedPatternCount * 4 + patternOffset;

#ifdef BEAGLE_TEST_OPTIMIZATION
            REALTYPE max01 = FAST_MAX(destP[offset + 0], destP[offset + 1]);
            REALTYPE max23 = FAST_MAX(destP[offset + 2], destP[offset + 3]);
            max = FAST_MAX(max, max01);
            max = FAST_MAX(max, max23);
#else
            #pragma unroll
            for (int i = 0; i < 4; i++) {
                if (destP[offset] > max)
                    max = destP[offset];
                offset++;
            }
#endif
        }

        if (max == 0)
            max = REALTYPE(1.0);

        REALTYPE oneOverMax = REALTYPE(1.0) / max;
        for (int l = 0; l < kCategoryCount; l++) {
            int offset = l * kPaddedPatternCount * 4 + patternOffset;
            #pragma unroll
            for (int i = 0; i < 4; i++)
                destP[offset++] *= oneOverMax;
        }

        if (useLogScalars) {
            REALTYPE logMax = log(max);
            scaleFactors[k] = logMax;
            if (cumulativeScaleFactors != NULL)
                cumulativeScaleFactors[k] += logMax;
        } else {
            scaleFactors[k] = max;
            if (cumulativeScaleFactors != NULL)
                cumulativeScaleFactors[k] += log(max);
        }
    }
}
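
// rescalePartials() finds, for each pattern, the maximum partial across all
// four states and all rate categories, divides every partial of that pattern
// by it, and records the factor: as log(max) when BEAGLE_FLAG_SCALERS_LOG is
// set, otherwise as the raw max. Cumulative buffers always accumulate log(max),
// which is what integrateOutStatesAndScale() adds back at the root.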

BEAGLE_CPU_TEMPLATE
int BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcEdgeLogLikelihoods(const int parIndex,
                                                                    const int childIndex,
                                                                    const int probIndex,
                                                                    const int categoryWeightsIndex,
                                                                    const int stateFrequenciesIndex,
                                                                    const int scalingFactorsIndex,
                                                                    double* outSumLogLikelihood) {
    // TODO: implement derivatives for calculateEdgeLnL

    assert(parIndex >= kTipCount);

    const REALTYPE* partialsParent = gPartials[parIndex];
    const REALTYPE* transMatrix = gTransitionMatrices[probIndex];
    const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];

    memset(integrationTmp, 0, (kPatternCount * kStateCount)*sizeof(REALTYPE));

    if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child

        const int* statesChild = gTipStates[childIndex];
        int v = 0; // Index for parent partials
        int w = 0;
        for (int l = 0; l < kCategoryCount; l++) {
            int u = 0; // Index in resulting product-partials (summed over categories)
            const REALTYPE weight = wt[l];
            for (int k = 0; k < kPatternCount; k++) {

                const int stateChild = statesChild[k];

                integrationTmp[u    ] += transMatrix[w            + stateChild] * partialsParent[v    ] * weight;
                integrationTmp[u + 1] += transMatrix[w + OFFSET*1 + stateChild] * partialsParent[v + 1] * weight;
                integrationTmp[u + 2] += transMatrix[w + OFFSET*2 + stateChild] * partialsParent[v + 2] * weight;
                integrationTmp[u + 3] += transMatrix[w + OFFSET*3 + stateChild] * partialsParent[v + 3] * weight;

                u += 4;
                v += 4;
            }
            w += OFFSET*4;
            if (kExtraPatterns)
                v += 4 * kExtraPatterns;
        }

    } else { // Integrate against a partial at the child

        const REALTYPE* partialsChild = gPartials[childIndex];
#if 0//
        int v = 0;
#endif
        int w = 0;
        for (int l = 0; l < kCategoryCount; l++) {
            int u = 0;
#if 1//
            int v = l*kPaddedPatternCount*4;
#endif
            const REALTYPE weight = wt[l];

            PREFETCH_MATRIX(1,transMatrix,w);

            for (int k = 0; k < kPatternCount; k++) {

                const REALTYPE* partials1 = partialsChild;

                PREFETCH_PARTIALS(1,partials1,v);

                DO_INTEGRATION(1);

                integrationTmp[u    ] += sum10 * partialsParent[v    ] * weight;
                integrationTmp[u + 1] += sum11 * partialsParent[v + 1] * weight;
                integrationTmp[u + 2] += sum12 * partialsParent[v + 2] * weight;
                integrationTmp[u + 3] += sum13 * partialsParent[v + 3] * weight;

                u += 4;
                v += 4;
            }
            w += OFFSET*4;
#if 0//
            if (kExtraPatterns)
                v += 4 * kExtraPatterns;
#endif//
        }
    }

    return integrateOutStatesAndScale(integrationTmp, stateFrequenciesIndex, scalingFactorsIndex, outSumLogLikelihood);
}

BEAGLE_CPU_TEMPLATE
int BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcRootLogLikelihoods(const int bufferIndex,
                                                                    const int categoryWeightsIndex,
                                                                    const int stateFrequenciesIndex,
                                                                    const int scalingFactorsIndex,
                                                                    double* outSumLogLikelihood) {

    const REALTYPE* rootPartials = gPartials[bufferIndex];
    assert(rootPartials);
    const REALTYPE* wt = gCategoryWeights[categoryWeightsIndex];

    int u = 0;
    int v = 0;
    const REALTYPE wt0 = wt[0];
    for (int k = 0; k < kPatternCount; k++) {
        integrationTmp[v    ] = rootPartials[v    ] * wt0;
        integrationTmp[v + 1] = rootPartials[v + 1] * wt0;
        integrationTmp[v + 2] = rootPartials[v + 2] * wt0;
        integrationTmp[v + 3] = rootPartials[v + 3] * wt0;
        v += 4;
    }
    for (int l = 1; l < kCategoryCount; l++) {
        u = 0;
        const REALTYPE wtl = wt[l];
        for (int k = 0; k < kPatternCount; k++) {
            integrationTmp[u    ] += rootPartials[v    ] * wtl;
            integrationTmp[u + 1] += rootPartials[v + 1] * wtl;
            integrationTmp[u + 2] += rootPartials[v + 2] * wtl;
            integrationTmp[u + 3] += rootPartials[v + 3] * wtl;

            u += 4;
            v += 4;
        }
        v += 4 * kExtraPatterns;
    }

    return integrateOutStatesAndScale(integrationTmp, stateFrequenciesIndex, scalingFactorsIndex, outSumLogLikelihood);
}
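
// At the root, integrationTmp[4*k + i] accumulates
//     sum_l wt[l] * rootPartials[l][k][i]
// over rate categories (the wt[0] pass initialises, later passes add); the
// state frequencies and any scale factors are then applied in
// integrateOutStatesAndScale().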

BEAGLE_CPU_TEMPLATE
int BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::calcRootLogLikelihoodsMulti(const int* bufferIndices,
                                                                         const int* categoryWeightsIndices,
                                                                         const int* stateFrequenciesIndices,
                                                                         const int* scaleBufferIndices,
                                                                         int count,
                                                                         double* outSumLogLikelihood) {

    int returnCode = BEAGLE_SUCCESS;

    std::vector<int> indexMaxScale(kPatternCount);
    std::vector<REALTYPE> maxScaleFactor(kPatternCount);

    for (int subsetIndex = 0; subsetIndex < count; ++subsetIndex) {
        const int rootPartialIndex = bufferIndices[subsetIndex];
        const REALTYPE* rootPartials = gPartials[rootPartialIndex];
        const REALTYPE* frequencies = gStateFrequencies[stateFrequenciesIndices[subsetIndex]];
        const REALTYPE* wt = gCategoryWeights[categoryWeightsIndices[subsetIndex]];
        int u = 0;
        int v = 0;

        const REALTYPE wt0 = wt[0];
        for (int k = 0; k < kPatternCount; k++) {
            integrationTmp[v    ] = rootPartials[v    ] * wt0;
            integrationTmp[v + 1] = rootPartials[v + 1] * wt0;
            integrationTmp[v + 2] = rootPartials[v + 2] * wt0;
            integrationTmp[v + 3] = rootPartials[v + 3] * wt0;
            v += 4;
        }
        for (int l = 1; l < kCategoryCount; l++) {
            u = 0;
            const REALTYPE wtl = wt[l];
            for (int k = 0; k < kPatternCount; k++) {
                integrationTmp[u    ] += rootPartials[v    ] * wtl;
                integrationTmp[u + 1] += rootPartials[v + 1] * wtl;
                integrationTmp[u + 2] += rootPartials[v + 2] * wtl;
                integrationTmp[u + 3] += rootPartials[v + 3] * wtl;

                u += 4;
                v += 4;
            }
            v += 4 * kExtraPatterns;
        }

        register REALTYPE freq0, freq1, freq2, freq3; // Is it a good idea to specify 'register'?
        freq0 = frequencies[0];
        freq1 = frequencies[1];
        freq2 = frequencies[2];
        freq3 = frequencies[3];

        u = 0;
        for (int k = 0; k < kPatternCount; k++) {
            REALTYPE sum =
                freq0 * integrationTmp[u    ] +
                freq1 * integrationTmp[u + 1] +
                freq2 * integrationTmp[u + 2] +
                freq3 * integrationTmp[u + 3];

            u += 4;

            // TODO: allow only some subsets to have scale indices
            if (scaleBufferIndices[0] != BEAGLE_OP_NONE || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) {
                int cumulativeScalingFactorIndex;
                if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)
                    cumulativeScalingFactorIndex = rootPartialIndex - kTipCount;
                else
                    cumulativeScalingFactorIndex = scaleBufferIndices[subsetIndex];

                const REALTYPE* cumulativeScaleFactors = gScaleBuffers[cumulativeScalingFactorIndex];

                if (subsetIndex == 0) {
                    indexMaxScale[k] = 0;
                    maxScaleFactor[k] = cumulativeScaleFactors[k];
                    for (int j = 1; j < count; j++) {
                        REALTYPE tmpScaleFactor;
                        if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)
                            tmpScaleFactor = gScaleBuffers[bufferIndices[j] - kTipCount][k];
                        else
                            tmpScaleFactor = gScaleBuffers[scaleBufferIndices[j]][k];

                        if (tmpScaleFactor > maxScaleFactor[k]) {
                            indexMaxScale[k] = j;
                            maxScaleFactor[k] = tmpScaleFactor;
                        }
                    }
                }

                if (subsetIndex != indexMaxScale[k])
                    sum *= exp((REALTYPE)(cumulativeScaleFactors[k] - maxScaleFactor[k]));
            }

            if (subsetIndex == 0) {
                outLogLikelihoodsTmp[k] = sum;
            } else if (subsetIndex == count - 1) {
                REALTYPE tmpSum = outLogLikelihoodsTmp[k] + sum;

                outLogLikelihoodsTmp[k] = log(tmpSum);
            } else {
                outLogLikelihoodsTmp[k] += sum;
            }
        }
    }

    if (scaleBufferIndices[0] != BEAGLE_OP_NONE || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) {
        for (int i = 0; i < kPatternCount; i++)
            outLogLikelihoodsTmp[i] += maxScaleFactor[i];
    }

    *outSumLogLikelihood = 0.0;
    for (int i = 0; i < kPatternCount; i++) {
        *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
    }

    if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
        returnCode = BEAGLE_ERROR_FLOATING_POINT;

    return returnCode;
}
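
// When several root subsets contribute to a single site likelihood, each
// subset's linear-scale contribution is rescaled by
// exp(cumulativeScaleFactors[k] - maxScaleFactor[k]) so that all subsets share
// the largest cumulative scale factor per pattern; that shared factor is added
// back in log space afterwards. Note that only the last subset applies log(),
// so outLogLikelihoodsTmp holds a likelihood, not a log-likelihood, until then.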

BEAGLE_CPU_TEMPLATE
const char* BeagleCPU4StateImpl<BEAGLE_CPU_GENERIC>::getName() {
    return getBeagleCPU4StateName<BEAGLE_CPU_FACTORY_GENERIC>();
}

// BeagleCPUImplFactory public methods

BEAGLE_CPU_FACTORY_TEMPLATE
BeagleImpl* BeagleCPU4StateImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::createImpl(int tipCount,
                                                                               int partialsBufferCount,
                                                                               int compactBufferCount,
                                                                               int stateCount,
                                                                               int patternCount,
                                                                               int eigenBufferCount,
                                                                               int matrixBufferCount,
                                                                               int categoryCount,
                                                                               int scaleBufferCount,
                                                                               int resourceNumber,
                                                                               long preferenceFlags,
                                                                               long requirementFlags,
                                                                               int* errorCode) {

    if (stateCount != 4) {
        return NULL;
    }

    BeagleImpl* impl = new BeagleCPU4StateImpl<REALTYPE, T_PAD_DEFAULT, P_PAD_DEFAULT>();

    try {
        if (impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount,
                                 patternCount, eigenBufferCount, matrixBufferCount,
                                 categoryCount, scaleBufferCount, resourceNumber,
                                 preferenceFlags, requirementFlags) == 0)
            return impl;
    }
    catch(...) {
        if (DEBUGGING_OUTPUT)
            std::cerr << "exception in initialize\n";
        delete impl;
        throw;
    }

    delete impl;

    return NULL;
}
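
// A minimal usage sketch (illustrative only; the argument values below are
// placeholders, not recommendations):
//
//     BeagleCPU4StateImplFactory<double> factory;
//     int errorCode = BEAGLE_SUCCESS;
//     BeagleImpl* impl = factory.createImpl(
//         4 /*tips*/, 8 /*partials buffers*/, 4 /*compact buffers*/,
//         4 /*states: anything other than 4 returns NULL*/, 100 /*patterns*/,
//         1 /*eigen buffers*/, 6 /*matrix buffers*/, 4 /*categories*/,
//         0 /*scale buffers*/, 0 /*resource*/, 0, 0, &errorCode);
//     // NULL is returned on failure; on success the caller owns the instance.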

BEAGLE_CPU_FACTORY_TEMPLATE
const char* BeagleCPU4StateImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::getName() {
    return getBeagleCPU4StateName<BEAGLE_CPU_FACTORY_GENERIC>();
}

BEAGLE_CPU_FACTORY_TEMPLATE
const long BeagleCPU4StateImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::getFlags() {
    long flags = BEAGLE_FLAG_COMPUTATION_SYNCH |
                 BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO |
                 BEAGLE_FLAG_THREADING_NONE |
                 BEAGLE_FLAG_PROCESSOR_CPU |
                 BEAGLE_FLAG_VECTOR_NONE |
                 BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW |
                 BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL |
                 BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED;

    if (DOUBLE_PRECISION)
        flags |= BEAGLE_FLAG_PRECISION_DOUBLE;
    else
        flags |= BEAGLE_FLAG_PRECISION_SINGLE;
    return flags;
}

} // namespace cpu
} // namespace beagle

#endif // BEAGLE_CPU_4STATE_IMPL_HPP