#ifndef BEAGLE_CPU_4STATE_SSE_IMPL_HPP
#define BEAGLE_CPU_4STATE_SSE_IMPL_HPP

#include "libhmsbeagle/config.h"

#include <cstring>
#include <cmath>
#include <cassert>
#include <iostream>

#include "libhmsbeagle/beagle.h"
#include "libhmsbeagle/CPU/BeagleCPU4StateSSEImpl.h"
#include "libhmsbeagle/CPU/SSEDefinitions.h"
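/*
 * SSE implementation of the 4-state (nucleotide) likelihood kernels. Each
 * 128-bit register (V_Real) holds two doubles, so the four partials of one
 * pattern occupy two registers. Only the double-precision specializations
 * below are vectorized; the single-precision ones delegate to the generic
 * BeagleCPU4StateImpl code paths.
 */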
#if 0   // Load four partials with four scalar broadcasts
#define SSE_PREFETCH_PARTIALS(dest, src, v) \
    dest##0 = VEC_SPLAT(src[v + 0]); \
    dest##1 = VEC_SPLAT(src[v + 1]); \
    dest##2 = VEC_SPLAT(src[v + 2]); \
    dest##3 = VEC_SPLAT(src[v + 3]);
#else // Load four partials in two 128-bit memory transactions
#define SSE_PREFETCH_PARTIALS(dest, src, v) \
    V_Real tmp_##dest##01, tmp_##dest##23; \
    tmp_##dest##01 = _mm_load_pd(&src[v + 0]); \
    tmp_##dest##23 = _mm_load_pd(&src[v + 2]); \
    dest##0 = _mm_shuffle_pd(tmp_##dest##01, tmp_##dest##01, _MM_SHUFFLE2(0,0)); \
    dest##1 = _mm_shuffle_pd(tmp_##dest##01, tmp_##dest##01, _MM_SHUFFLE2(1,1)); \
    dest##2 = _mm_shuffle_pd(tmp_##dest##23, tmp_##dest##23, _MM_SHUFFLE2(0,0)); \
    dest##3 = _mm_shuffle_pd(tmp_##dest##23, tmp_##dest##23, _MM_SHUFFLE2(1,1));
#endif
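/*
 * Both variants leave dest0..dest3 holding src[v+0..v+3], each value
 * duplicated across both lanes of a register. The active variant issues two
 * 128-bit loads and then duplicates one lane at a time with
 * _mm_shuffle_pd(x, x, _MM_SHUFFLE2(i,i)), rather than four scalar
 * broadcasts, halving the number of memory transactions.
 */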
#define SSE_PREFETCH_MATRICES(src_m1, src_m2, dest_vu_m1, dest_vu_m2) \
    const double *m1 = (src_m1); \
    const double *m2 = (src_m2); \
    for (int i = 0; i < OFFSET; i++, m1++, m2++) { \
        dest_vu_m1[i][0].x[0] = m1[0*OFFSET]; \
        dest_vu_m1[i][0].x[1] = m1[1*OFFSET]; \
        dest_vu_m2[i][0].x[0] = m2[0*OFFSET]; \
        dest_vu_m2[i][0].x[1] = m2[1*OFFSET]; \
        dest_vu_m1[i][1].x[0] = m1[2*OFFSET]; \
        dest_vu_m1[i][1].x[1] = m1[3*OFFSET]; \
        dest_vu_m2[i][1].x[0] = m2[2*OFFSET]; \
        dest_vu_m2[i][1].x[1] = m2[3*OFFSET]; \
    }
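/*
 * Assuming the transition matrices are stored row-major with a padded row
 * stride of OFFSET, this gathers column i of each matrix into two two-lane
 * vectors: dest_vu_m[i][0] = {M[0][i], M[1][i]} and
 * dest_vu_m[i][1] = {M[2][i], M[3][i]}. A 4x4 matrix-vector product then
 * reduces to summing the four columns, each scaled by one broadcast partial
 * (see the VEC_MULT/VEC_MADD chains in the kernels below).
 */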
// Single-matrix variant of SSE_PREFETCH_MATRICES
#define SSE_PREFETCH_MATRIX(src_m1, dest_vu_m1) \
    const double *m1 = (src_m1); \
    for (int i = 0; i < OFFSET; i++, m1++) { \
        dest_vu_m1[i][0].x[0] = m1[0*OFFSET]; \
        dest_vu_m1[i][0].x[1] = m1[1*OFFSET]; \
        dest_vu_m1[i][1].x[0] = m1[2*OFFSET]; \
        dest_vu_m1[i][1].x[1] = m1[3*OFFSET]; \
    }
BEAGLE_CPU_FACTORY_TEMPLATE
inline const char* getBeagleCPU4StateSSEName(){ return "CPU-4State-SSE-Unknown"; }

template<>
inline const char* getBeagleCPU4StateSSEName<double>(){ return "CPU-4State-SSE-Double"; }

template<>
inline const char* getBeagleCPU4StateSSEName<float>(){ return "CPU-4State-SSE-Single"; }
BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesStates(float* destP,
                                                                      const int* states_q,
                                                                      const float* matrices_q,
                                                                      const int* states_r,
                                                                      const float* matrices_r) {
    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesStates(destP,
                                                                  states_q,
                                                                  matrices_q,
                                                                  states_r,
                                                                  matrices_r);
}
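/*
 * Double-precision states*states kernel: with both children observed, the
 * four destination partials for pattern k are just the element-wise product
 * of column states_q[k] of the first transition matrix and column
 * states_r[k] of the second, taken two lanes at a time from the prefetched
 * columns.
 */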
BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcStatesStates(double* destP,
                                                                       const int* states_q,
                                                                       const double* matrices_q,
                                                                       const int* states_r,
                                                                       const double* matrices_r) {

    VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];

    int w = 0;
    V_Real *destPvec = (V_Real *)destP;

    for (int l = 0; l < kCategoryCount; l++) {

        SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);

        for (int k = 0; k < kPatternCount; k++) {

            const int state_q = states_q[k];
            const int state_r = states_r[k];

            *destPvec++ = VEC_MULT(vu_mq[state_q][0].vx, vu_mr[state_r][0].vx);
            *destPvec++ = VEC_MULT(vu_mq[state_q][1].vx, vu_mr[state_r][1].vx);
        }

        w += OFFSET*4;
        if (kExtraPatterns)
            destPvec += kExtraPatterns * 2;
    }
}
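/*
 * kExtraPatterns pads kPatternCount up to the padded pattern count; each
 * padded pattern occupies two V_Real vectors (four doubles), hence the
 * kExtraPatterns * 2 skip at the end of every rate category above and in
 * the kernels below.
 */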
BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesPartials(float* destP,
                                                                        const int* states_q,
                                                                        const float* matrices_q,
                                                                        const float* partials_r,
                                                                        const float* matrices_r) {
    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesPartials(destP,
                                                                    states_q,
                                                                    matrices_q,
                                                                    partials_r,
                                                                    matrices_r);
}
BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcStatesPartials(double* destP,
                                                                         const int* states_q,
                                                                         const double* matrices_q,
                                                                         const double* partials_r,
                                                                         const double* matrices_r) {

    int v = 0;
    int w = 0;

    VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];
    V_Real *destPvec = (V_Real *)destP;
    V_Real destr_01, destr_23;

    for (int l = 0; l < kCategoryCount; l++) {

        SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);

        for (int k = 0; k < kPatternCount; k++) {

            const int state_q = states_q[k];
            V_Real vp0, vp1, vp2, vp3;
            SSE_PREFETCH_PARTIALS(vp,partials_r,v);
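            /*
             * Accumulate destr = M_r * p_r two output entries at a time:
             * destr_01 = {sum_i M[0][i]*p_i, sum_i M[1][i]*p_i}, and
             * destr_23 likewise for rows 2 and 3, using the column vectors
             * prefetched into vu_mr and the broadcast partials vp0..vp3.
             */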
            destr_01 = VEC_MULT(vp0, vu_mr[0][0].vx);
            destr_01 = VEC_MADD(vp1, vu_mr[1][0].vx, destr_01);
            destr_01 = VEC_MADD(vp2, vu_mr[2][0].vx, destr_01);
            destr_01 = VEC_MADD(vp3, vu_mr[3][0].vx, destr_01);
            destr_23 = VEC_MULT(vp0, vu_mr[0][1].vx);
            destr_23 = VEC_MADD(vp1, vu_mr[1][1].vx, destr_23);
            destr_23 = VEC_MADD(vp2, vu_mr[2][1].vx, destr_23);
            destr_23 = VEC_MADD(vp3, vu_mr[3][1].vx, destr_23);

            *destPvec++ = VEC_MULT(vu_mq[state_q][0].vx, destr_01);
            *destPvec++ = VEC_MULT(vu_mq[state_q][1].vx, destr_23);

            v += 4;
        }

        w += OFFSET*4;
        if (kExtraPatterns) {
            destPvec += kExtraPatterns * 2;
            v += kExtraPatterns * 4;
        }
    }
}
BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesPartialsFixedScaling(float* destP,
                                                                                    const int* __restrict states1,
                                                                                    const float* __restrict matrices1,
                                                                                    const float* __restrict partials2,
                                                                                    const float* __restrict matrices2,
                                                                                    const float* __restrict scaleFactors) {
    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesPartialsFixedScaling(destP,
                                                                                states1,
                                                                                matrices1,
                                                                                partials2,
                                                                                matrices2,
                                                                                scaleFactors);
}
BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcStatesPartialsFixedScaling(double* destP,
                                                                                     const int* __restrict states_q,
                                                                                     const double* __restrict matrices_q,
                                                                                     const double* __restrict partials_r,
                                                                                     const double* __restrict matrices_r,
                                                                                     const double* __restrict scaleFactors) {

    int v = 0;
    int w = 0;

    VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];
    V_Real *destPvec = (V_Real *)destP;
    V_Real destr_01, destr_23;

    for (int l = 0; l < kCategoryCount; l++) {

        SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);

        for (int k = 0; k < kPatternCount; k++) {

            const V_Real scaleFactor = VEC_SPLAT(scaleFactors[k]);
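            /*
             * Fixed scaling: the per-pattern scale factor is broadcast to
             * both lanes so the two finished result vectors below can be
             * divided through by it in vector form.
             */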
            const int state_q = states_q[k];
            V_Real vp0, vp1, vp2, vp3;
            SSE_PREFETCH_PARTIALS(vp,partials_r,v);

            destr_01 = VEC_MULT(vp0, vu_mr[0][0].vx);
            destr_01 = VEC_MADD(vp1, vu_mr[1][0].vx, destr_01);
            destr_01 = VEC_MADD(vp2, vu_mr[2][0].vx, destr_01);
            destr_01 = VEC_MADD(vp3, vu_mr[3][0].vx, destr_01);
            destr_23 = VEC_MULT(vp0, vu_mr[0][1].vx);
            destr_23 = VEC_MADD(vp1, vu_mr[1][1].vx, destr_23);
            destr_23 = VEC_MADD(vp2, vu_mr[2][1].vx, destr_23);
            destr_23 = VEC_MADD(vp3, vu_mr[3][1].vx, destr_23);

            *destPvec++ = VEC_DIV(VEC_MULT(vu_mq[state_q][0].vx, destr_01), scaleFactor);
            *destPvec++ = VEC_DIV(VEC_MULT(vu_mq[state_q][1].vx, destr_23), scaleFactor);

            v += 4;
        }

        w += OFFSET*4;
        if (kExtraPatterns) {
            destPvec += kExtraPatterns * 2;
            v += kExtraPatterns * 4;
        }
    }
}
BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartials(float* destP,
                                                                          const float* partials_q,
                                                                          const float* matrices_q,
                                                                          const float* partials_r,
                                                                          const float* matrices_r) {
    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartials(destP,
                                                                      partials_q,
                                                                      matrices_q,
                                                                      partials_r,
                                                                      matrices_r);
}
BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcPartialsPartials(double* destP,
                                                                           const double* partials_q,
                                                                           const double* matrices_q,
                                                                           const double* partials_r,
                                                                           const double* matrices_r) {

    int v = 0;
    int w = 0;

    V_Real destq_01, destq_23, destr_01, destr_23;
    VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];
    V_Real *destPvec = (V_Real *)destP;

    for (int l = 0; l < kCategoryCount; l++) {

        SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);

        for (int k = 0; k < kPatternCount; k++) {

# if 1 && !defined(_WIN32)
            __builtin_prefetch (&partials_q[v+64]);
            __builtin_prefetch (&partials_r[v+64]);
# endif
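            /*
             * Software prefetch: hint the partials 64 doubles (512 bytes,
             * i.e. 16 patterns) ahead of the current read position.
             * __builtin_prefetch is a GCC/Clang builtin, hence the
             * !defined(_WIN32) guard for MSVC builds.
             */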
            V_Real vpq_0, vpq_1, vpq_2, vpq_3;
            SSE_PREFETCH_PARTIALS(vpq_,partials_q,v);

            V_Real vpr_0, vpr_1, vpr_2, vpr_3;
            SSE_PREFETCH_PARTIALS(vpr_,partials_r,v);
# if 1  // accumulate destq and destr with dependent multiply-add chains

            destq_01 = VEC_MULT(vpq_0, vu_mq[0][0].vx);
            destq_01 = VEC_MADD(vpq_1, vu_mq[1][0].vx, destq_01);
            destq_01 = VEC_MADD(vpq_2, vu_mq[2][0].vx, destq_01);
            destq_01 = VEC_MADD(vpq_3, vu_mq[3][0].vx, destq_01);
            destq_23 = VEC_MULT(vpq_0, vu_mq[0][1].vx);
            destq_23 = VEC_MADD(vpq_1, vu_mq[1][1].vx, destq_23);
            destq_23 = VEC_MADD(vpq_2, vu_mq[2][1].vx, destq_23);
            destq_23 = VEC_MADD(vpq_3, vu_mq[3][1].vx, destq_23);

            destr_01 = VEC_MULT(vpr_0, vu_mr[0][0].vx);
            destr_01 = VEC_MADD(vpr_1, vu_mr[1][0].vx, destr_01);
            destr_01 = VEC_MADD(vpr_2, vu_mr[2][0].vx, destr_01);
            destr_01 = VEC_MADD(vpr_3, vu_mr[3][0].vx, destr_01);
            destr_23 = VEC_MULT(vpr_0, vu_mr[0][1].vx);
            destr_23 = VEC_MADD(vpr_1, vu_mr[1][1].vx, destr_23);
            destr_23 = VEC_MADD(vpr_2, vu_mr[2][1].vx, destr_23);
            destr_23 = VEC_MADD(vpr_3, vu_mr[3][1].vx, destr_23);

# else  // same sums accumulated into independent registers (see note below)

            V_Real a, b, c, d;

            a = VEC_MULT(vpq_0, vu_mq[0][0].vx);
            b = VEC_MULT(vpq_2, vu_mq[2][0].vx);
            c = VEC_MULT(vpq_0, vu_mq[0][1].vx);
            d = VEC_MULT(vpq_2, vu_mq[2][1].vx);
            a = VEC_MADD(vpq_1, vu_mq[1][0].vx, a);
            b = VEC_MADD(vpq_3, vu_mq[3][0].vx, b);
            c = VEC_MADD(vpq_1, vu_mq[1][1].vx, c);
            d = VEC_MADD(vpq_3, vu_mq[3][1].vx, d);
            destq_01 = VEC_ADD(a, b);
            destq_23 = VEC_ADD(c, d);

            a = VEC_MULT(vpr_0, vu_mr[0][0].vx);
            b = VEC_MULT(vpr_2, vu_mr[2][0].vx);
            c = VEC_MULT(vpr_0, vu_mr[0][1].vx);
            d = VEC_MULT(vpr_2, vu_mr[2][1].vx);
            a = VEC_MADD(vpr_1, vu_mr[1][0].vx, a);
            b = VEC_MADD(vpr_3, vu_mr[3][0].vx, b);
            c = VEC_MADD(vpr_1, vu_mr[1][1].vx, c);
            d = VEC_MADD(vpr_3, vu_mr[3][1].vx, d);
            destr_01 = VEC_ADD(a, b);
            destr_23 = VEC_ADD(c, d);

# endif
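            /*
             * Note on the two variants above: SSE2 has no fused
             * multiply-add, so VEC_MADD necessarily lowers to a multiply
             * followed by a dependent add. The chained form keeps one
             * accumulator per destination register, serializing those adds;
             * the decoupled form builds each four-term sum in two
             * independent halves (a+b, c+d) and merges them with a single
             * VEC_ADD, turning a serial chain into a tree at the same
             * arithmetic cost.
             */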
# if 1
            destPvec[0] = VEC_MULT(destq_01, destr_01);
            destPvec[1] = VEC_MULT(destq_23, destr_23);
# else  // equivalent explicit stores through destP
            VEC_STORE(destP + v + 0, VEC_MULT(destq_01, destr_01));
            VEC_STORE(destP + v + 2, VEC_MULT(destq_23, destr_23));
# endif
            destPvec += 2;
            v += 4;
        }

        w += OFFSET*4;
        if (kExtraPatterns) {
            destPvec += kExtraPatterns * 2;
            v += kExtraPatterns * 4;
        }
    }
}
BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartialsFixedScaling(float* destP,
                                                                                      const float* child0Partials,
                                                                                      const float* child0TransMat,
                                                                                      const float* child1Partials,
                                                                                      const float* child1TransMat,
                                                                                      const float* scaleFactors) {
    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartialsFixedScaling(destP,
                                                                                  child0Partials,
                                                                                  child0TransMat,
                                                                                  child1Partials,
                                                                                  child1TransMat,
                                                                                  scaleFactors);
}
BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcPartialsPartialsFixedScaling(double* destP,
                                                                                       const double* partials_q,
                                                                                       const double* matrices_q,
                                                                                       const double* partials_r,
                                                                                       const double* matrices_r,
                                                                                       const double* scaleFactors) {

    int v = 0;
    int w = 0;

    V_Real destq_01, destq_23, destr_01, destr_23;
    VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];
    V_Real *destPvec = (V_Real *)destP;

    for (int l = 0; l < kCategoryCount; l++) {

        SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);

        for (int k = 0; k < kPatternCount; k++) {

# if 1 && !defined(_WIN32)
            __builtin_prefetch (&partials_q[v+64]);
            __builtin_prefetch (&partials_r[v+64]);
# endif

            const V_Real scaleFactor = VEC_SPLAT(scaleFactors[k]);
            V_Real vpq_0, vpq_1, vpq_2, vpq_3;
            SSE_PREFETCH_PARTIALS(vpq_,partials_q,v);

            V_Real vpr_0, vpr_1, vpr_2, vpr_3;
            SSE_PREFETCH_PARTIALS(vpr_,partials_r,v);

            destq_01 = VEC_MULT(vpq_0, vu_mq[0][0].vx);
            destq_01 = VEC_MADD(vpq_1, vu_mq[1][0].vx, destq_01);
            destq_01 = VEC_MADD(vpq_2, vu_mq[2][0].vx, destq_01);
            destq_01 = VEC_MADD(vpq_3, vu_mq[3][0].vx, destq_01);
            destq_23 = VEC_MULT(vpq_0, vu_mq[0][1].vx);
            destq_23 = VEC_MADD(vpq_1, vu_mq[1][1].vx, destq_23);
            destq_23 = VEC_MADD(vpq_2, vu_mq[2][1].vx, destq_23);
            destq_23 = VEC_MADD(vpq_3, vu_mq[3][1].vx, destq_23);

            destr_01 = VEC_MULT(vpr_0, vu_mr[0][0].vx);
            destr_01 = VEC_MADD(vpr_1, vu_mr[1][0].vx, destr_01);
            destr_01 = VEC_MADD(vpr_2, vu_mr[2][0].vx, destr_01);
            destr_01 = VEC_MADD(vpr_3, vu_mr[3][0].vx, destr_01);
            destr_23 = VEC_MULT(vpr_0, vu_mr[0][1].vx);
            destr_23 = VEC_MADD(vpr_1, vu_mr[1][1].vx, destr_23);
            destr_23 = VEC_MADD(vpr_2, vu_mr[2][1].vx, destr_23);
            destr_23 = VEC_MADD(vpr_3, vu_mr[3][1].vx, destr_23);

            destPvec[0] = VEC_DIV(VEC_MULT(destq_01, destr_01), scaleFactor);
            destPvec[1] = VEC_DIV(VEC_MULT(destq_23, destr_23), scaleFactor);
            destPvec += 2;
            v += 4;
        }

        w += OFFSET*4;
        if (kExtraPatterns) {
            destPvec += kExtraPatterns * 2;
            v += kExtraPatterns * 4;
        }
    }
}
BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartialsAutoScaling(float* destP,
                                                                                     const float* partials_q,
                                                                                     const float* matrices_q,
                                                                                     const float* partials_r,
                                                                                     const float* matrices_r,
                                                                                     int* activateScaling) {
    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartialsAutoScaling(destP,
                                                                                 partials_q,
                                                                                 matrices_q,
                                                                                 partials_r,
                                                                                 matrices_r,
                                                                                 activateScaling);
}
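/*
 * Note: auto-scaling is not vectorized in this file; the single- and
 * double-precision specializations both forward to the generic 4-state
 * implementation.
 */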
BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcPartialsPartialsAutoScaling(double* destP,
                                                                                      const double* partials_q,
                                                                                      const double* matrices_q,
                                                                                      const double* partials_r,
                                                                                      const double* matrices_r,
                                                                                      int* activateScaling) {
    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcPartialsPartialsAutoScaling(destP,
                                                                                  partials_q,
                                                                                  matrices_q,
                                                                                  partials_r,
                                                                                  matrices_r,
                                                                                  activateScaling);
}
BEAGLE_CPU_4_SSE_TEMPLATE
int BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcEdgeLogLikelihoods(const int parIndex,
                                                                           const int childIndex,
                                                                           const int probIndex,
                                                                           const int categoryWeightsIndex,
                                                                           const int stateFrequenciesIndex,
                                                                           const int scalingFactorsIndex,
                                                                           double* outSumLogLikelihood) {
    return BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcEdgeLogLikelihoods(parIndex,
                                                                               childIndex,
                                                                               probIndex,
                                                                               categoryWeightsIndex,
                                                                               stateFrequenciesIndex,
                                                                               scalingFactorsIndex,
                                                                               outSumLogLikelihood);
}
BEAGLE_CPU_4_SSE_TEMPLATE
int BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcEdgeLogLikelihoods(const int parIndex,
                                                                            const int childIndex,
                                                                            const int probIndex,
                                                                            const int categoryWeightsIndex,
                                                                            const int stateFrequenciesIndex,
                                                                            const int scalingFactorsIndex,
                                                                            double* outSumLogLikelihood) {

    int returnCode = BEAGLE_SUCCESS;

    assert(parIndex >= kTipCount);

    const double* cl_r = gPartials[parIndex];
    double* cl_p = integrationTmp;
    const double* transMatrix = gTransitionMatrices[probIndex];
    const double* wt = gCategoryWeights[categoryWeightsIndex];
    const double* freqs = gStateFrequencies[stateFrequenciesIndex];

    memset(cl_p, 0, (kPatternCount * kStateCount) * sizeof(double));
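    /*
     * integrationTmp accumulates, across rate categories, the
     * category-weighted matrix-vector products against the child times the
     * parent partials, so it must start zeroed. The two branches below
     * handle a state (tip) child and a partials child respectively.
     */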
    if (childIndex < kTipCount && gTipStates[childIndex]) { // child is a tip with states

        const int* statesChild = gTipStates[childIndex];

        int w = 0;
        V_Real *vcl_r = (V_Real *)cl_r;

        for(int l = 0; l < kCategoryCount; l++) {

            VecUnion vu_m[OFFSET][2];
            SSE_PREFETCH_MATRIX(transMatrix + w, vu_m)

            V_Real *vcl_p = (V_Real *)cl_p;

            for(int k = 0; k < kPatternCount; k++) {

                const int stateChild = statesChild[k];
                V_Real vwt = VEC_SPLAT(wt[l]);

                V_Real wtdPartials = VEC_MULT(*vcl_r++, vwt);
                *vcl_p++ = VEC_MADD(vu_m[stateChild][0].vx, wtdPartials, *vcl_p);

                wtdPartials = VEC_MULT(*vcl_r++, vwt);
                *vcl_p++ = VEC_MADD(vu_m[stateChild][1].vx, wtdPartials, *vcl_p);
            }

            w += OFFSET*4;
            vcl_r += 2 * kExtraPatterns;
        }

    } else { // child has partials
        const double* cl_q = gPartials[childIndex];
        V_Real *vcl_r = (V_Real *)cl_r;

        int v = 0;
        int w = 0;

        for(int l = 0; l < kCategoryCount; l++) {

            V_Real *vcl_p = (V_Real *)cl_p;

            VecUnion vu_m[OFFSET][2];
            SSE_PREFETCH_MATRIX(transMatrix + w, vu_m)

            for(int k = 0; k < kPatternCount; k++) {
                V_Real vclp_01, vclp_23;
                V_Real vwt = VEC_SPLAT(wt[l]);

                V_Real vcl_q0, vcl_q1, vcl_q2, vcl_q3;
                SSE_PREFETCH_PARTIALS(vcl_q,cl_q,v);
                vclp_01 = VEC_MULT(vcl_q0, vu_m[0][0].vx);
                vclp_01 = VEC_MADD(vcl_q1, vu_m[1][0].vx, vclp_01);
                vclp_01 = VEC_MADD(vcl_q2, vu_m[2][0].vx, vclp_01);
                vclp_01 = VEC_MADD(vcl_q3, vu_m[3][0].vx, vclp_01);
                vclp_23 = VEC_MULT(vcl_q0, vu_m[0][1].vx);
                vclp_23 = VEC_MADD(vcl_q1, vu_m[1][1].vx, vclp_23);
                vclp_23 = VEC_MADD(vcl_q2, vu_m[2][1].vx, vclp_23);
                vclp_23 = VEC_MADD(vcl_q3, vu_m[3][1].vx, vclp_23);
                vclp_01 = VEC_MULT(vclp_01, vwt);
                vclp_23 = VEC_MULT(vclp_23, vwt);

                *vcl_p++ = VEC_MADD(vclp_01, *vcl_r++, *vcl_p);
                *vcl_p++ = VEC_MADD(vclp_23, *vcl_r++, *vcl_p);

                v += 4;
            }

            w += OFFSET*4;
            if (kExtraPatterns) {
                vcl_r += 2 * kExtraPatterns;
                v += 4 * kExtraPatterns;
            }
        }
    }
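    /*
     * Reduce to per-pattern log-likelihoods: dot each pattern's accumulated
     * values in integrationTmp with the stationary frequencies and take the
     * logarithm.
     */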
    int u = 0;
    for(int k = 0; k < kPatternCount; k++) {
        double sumOverI = 0.0;
        for(int i = 0; i < kStateCount; i++) {
            sumOverI += freqs[i] * cl_p[u];
            u++;
        }

        outLogLikelihoodsTmp[k] = log(sumOverI);
    }
    if (scalingFactorsIndex != BEAGLE_OP_NONE) {
        const double* scalingFactors = gScaleBuffers[scalingFactorsIndex];
        for(int k=0; k < kPatternCount; k++)
            outLogLikelihoodsTmp[k] += scalingFactors[k];
    }
    *outSumLogLikelihood = 0.0;
    for (int i = 0; i < kPatternCount; i++) {
        *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
    }

    // x - x == 0.0 fails only for NaN and +/-Inf, flagging a numerical error
    if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
        returnCode = BEAGLE_ERROR_FLOATING_POINT;

    return returnCode;
}
BEAGLE_CPU_4_SSE_TEMPLATE
int BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::getPaddedPatternsModulus() {
    return 1;  // vectorization here is within a pattern, not across patterns
}

BEAGLE_CPU_4_SSE_TEMPLATE
int BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::getPaddedPatternsModulus() {
    return 1;  // vectorization here is within a pattern, not across patterns
}
BEAGLE_CPU_4_SSE_TEMPLATE
const char* BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::getName() {
    return getBeagleCPU4StateSSEName<float>();
}

BEAGLE_CPU_4_SSE_TEMPLATE
const char* BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::getName() {
    return getBeagleCPU4StateSSEName<double>();
}
BEAGLE_CPU_4_SSE_TEMPLATE
const long BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::getFlags() {
    return BEAGLE_FLAG_COMPUTATION_SYNCH |
           BEAGLE_FLAG_THREADING_NONE |
           BEAGLE_FLAG_PROCESSOR_CPU |
           BEAGLE_FLAG_PRECISION_SINGLE |
           BEAGLE_FLAG_VECTOR_SSE;
}

BEAGLE_CPU_4_SSE_TEMPLATE
const long BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::getFlags() {
    return BEAGLE_FLAG_COMPUTATION_SYNCH |
           BEAGLE_FLAG_THREADING_NONE |
           BEAGLE_FLAG_PROCESSOR_CPU |
           BEAGLE_FLAG_PRECISION_DOUBLE |
           BEAGLE_FLAG_VECTOR_SSE;
}
BEAGLE_CPU_FACTORY_TEMPLATE
BeagleImpl* BeagleCPU4StateSSEImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::createImpl(int tipCount,
                                                                                  int partialsBufferCount,
                                                                                  int compactBufferCount,
                                                                                  int stateCount,
                                                                                  int patternCount,
                                                                                  int eigenBufferCount,
                                                                                  int matrixBufferCount,
                                                                                  int categoryCount,
                                                                                  int scaleBufferCount,
                                                                                  int resourceNumber,
                                                                                  long preferenceFlags,
                                                                                  long requirementFlags,
                                                                                  int* errorCode) {

    if (stateCount != 4) {
        return NULL;
    }

    BeagleCPU4StateSSEImpl<REALTYPE, T_PAD_4_SSE_DEFAULT, P_PAD_4_SSE_DEFAULT>* impl =
        new BeagleCPU4StateSSEImpl<REALTYPE, T_PAD_4_SSE_DEFAULT, P_PAD_4_SSE_DEFAULT>();

    if (!CPUSupportsSSE()) {
        delete impl;
        return NULL;
    }

    try {
        if (impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount,
                                 patternCount, eigenBufferCount, matrixBufferCount,
                                 categoryCount, scaleBufferCount, resourceNumber,
                                 preferenceFlags, requirementFlags) == 0)
            return impl;
    }
    catch(...) {
        if (DEBUGGING_OUTPUT)
            std::cerr << "exception in initialize\n";
        delete impl;
        throw;
    }

    delete impl;
    return NULL;
}
BEAGLE_CPU_FACTORY_TEMPLATE
const char* BeagleCPU4StateSSEImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::getName() {
    return getBeagleCPU4StateSSEName<BEAGLE_CPU_FACTORY_GENERIC>();
}
template<>
const long BeagleCPU4StateSSEImplFactory<double>::getFlags() {
    return BEAGLE_FLAG_COMPUTATION_SYNCH |
           BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO |
           BEAGLE_FLAG_THREADING_NONE |
           BEAGLE_FLAG_PROCESSOR_CPU |
           BEAGLE_FLAG_VECTOR_SSE |
           BEAGLE_FLAG_PRECISION_DOUBLE |
           BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW |
           BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL |
           BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED;
}
template<>
const long BeagleCPU4StateSSEImplFactory<float>::getFlags() {
    return BEAGLE_FLAG_COMPUTATION_SYNCH |
           BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO |
           BEAGLE_FLAG_THREADING_NONE |
           BEAGLE_FLAG_PROCESSOR_CPU |
           BEAGLE_FLAG_VECTOR_SSE |
           BEAGLE_FLAG_PRECISION_SINGLE |
           BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW |
           BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL |
           BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED;
}
#endif //BEAGLE_CPU_4STATE_SSE_IMPL_HPP