27 #ifndef BEAGLE_CPU_SSE_IMPL_HPP
28 #define BEAGLE_CPU_SSE_IMPL_HPP
32 #include "libhmsbeagle/config.h"
42 #include "libhmsbeagle/beagle.h"
43 #include "libhmsbeagle/CPU/BeagleCPUImpl.h"
44 #include "libhmsbeagle/CPU/BeagleCPUSSEImpl.h"
45 #include "libhmsbeagle/CPU/SSEDefinitions.h"
50 BEAGLE_CPU_FACTORY_TEMPLATE
51 inline const char* getBeagleCPUSSEName(){
return "CPU-SSE-Unknown"; };
54 inline const char* getBeagleCPUSSEName<double>(){
return "CPU-SSE-Double"; };
57 inline const char* getBeagleCPUSSEName<float>(){
return "CPU-SSE-Single"; };
63 BEAGLE_CPU_SSE_TEMPLATE
64 void BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcStatesStates(
double* destP,
66 const double* matrices_q,
68 const double* matrices_r) {
70 BeagleCPUImpl<BEAGLE_CPU_SSE_DOUBLE>::calcStatesStates(destP,
114 BEAGLE_CPU_SSE_TEMPLATE
115 void BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcStatesPartials(
double* destP,
117 const double* matrices_q,
118 const double* partials_r,
119 const double* matrices_r) {
120 BeagleCPUImpl<BEAGLE_CPU_SSE_DOUBLE>::calcStatesPartials(
209 BEAGLE_CPU_SSE_TEMPLATE
210 void BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcPartialsPartials(
double* __restrict destP,
211 const double* __restrict partials1,
212 const double* __restrict matrices1,
213 const double* __restrict partials2,
214 const double* __restrict matrices2) {
215 int stateCountMinusOne = kPartialsPaddedStateCount - 1;
216 #pragma omp parallel for num_threads(kCategoryCount)
218 double* destPu = destP + l*kPartialsPaddedStateCount*
kPatternCount;
229 register V_Real sum1_vecA = VEC_SETZERO();
230 register V_Real sum2_vecA = VEC_SETZERO();
231 for (
int j = 0; j < stateCountMinusOne; j += 2) {
232 sum1_vecA = VEC_MADD(
233 VEC_LOAD(matrices1 + w + j),
234 VEC_LOAD(partials1 + v + j),
236 sum2_vecA = VEC_MADD(
237 VEC_LOAD(matrices2 + w + j),
238 VEC_LOAD(partials2 + v + j),
242 sum1_vecA = VEC_MULT(
243 VEC_ADD(sum1_vecA, VEC_SWAP(sum1_vecA)),
244 VEC_ADD(sum2_vecA, VEC_SWAP(sum2_vecA))
248 w += kStateCount + T_PAD;
250 #ifndef DOUBLE_UNROLL
252 VEC_STORE_SCALAR(destPu, sum1_vecA);
257 register V_Real sum1_vecB = VEC_SETZERO();
258 register V_Real sum2_vecB = VEC_SETZERO();
259 for (
int j = 0; j < stateCountMinusOne; j += 2) {
260 sum1_vecB = VEC_MADD(
261 VEC_LOAD(matrices1 + w + j),
262 VEC_LOAD(partials1 + v + j),
264 sum2_vecB = VEC_MADD(
265 VEC_LOAD(matrices2 + w + j),
266 VEC_LOAD(partials2 + v + j),
270 sum1_vecB = VEC_MULT(
271 VEC_ADD(sum1_vecB, VEC_SWAP(sum1_vecB)),
272 VEC_ADD(sum2_vecB, VEC_SWAP(sum2_vecB))
276 w += kStateCount + T_PAD;
279 VEC_STORE(destPu, VEC_MOVE(sum1_vecA, sum1_vecB));
285 v += kPartialsPaddedStateCount;
290 BEAGLE_CPU_SSE_TEMPLATE
291 void BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcPartialsPartialsFixedScaling(
292 double* __restrict destP,
293 const double* __restrict partials1,
294 const double* __restrict matrices1,
295 const double* __restrict partials2,
296 const double* __restrict matrices2,
297 const double* __restrict scaleFactors) {
298 int stateCountMinusOne = kPartialsPaddedStateCount - 1;
299 #pragma omp parallel for num_threads(kCategoryCount)
301 double* destPu = destP + l*kPartialsPaddedStateCount*
kPatternCount;
305 const V_Real scalar = VEC_SPLAT(scaleFactors[k]);
308 register V_Real sum1_vec;
309 register V_Real sum2_vec;
312 sum1_vec = VEC_SETZERO();
313 sum2_vec = VEC_SETZERO();
314 for ( ; j < stateCountMinusOne; j += 2) {
316 VEC_LOAD(matrices1 + w + j),
317 VEC_LOAD(partials1 + v + j),
320 VEC_LOAD(matrices2 + w + j),
321 VEC_LOAD(partials2 + v + j),
324 VEC_STORE_SCALAR(destPu,
326 VEC_ADD(sum1_vec, VEC_SWAP(sum1_vec)),
327 VEC_ADD(sum2_vec, VEC_SWAP(sum2_vec))
332 w += kStateCount + T_PAD;
337 v += kPartialsPaddedStateCount;
343 BEAGLE_CPU_SSE_TEMPLATE
344 void BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcPartialsPartialsAutoScaling(
double* destP,
345 const double* partials_q,
346 const double* matrices_q,
347 const double* partials_r,
348 const double* matrices_r,
349 int* activateScaling) {
350 BeagleCPUImpl<BEAGLE_CPU_SSE_DOUBLE>::calcPartialsPartialsAutoScaling(destP,
374 BEAGLE_CPU_SSE_TEMPLATE
375 int BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcEdgeLogLikelihoods(
const int parIndex,
376 const int childIndex,
378 const int categoryWeightsIndex,
379 const int stateFrequenciesIndex,
380 const int scalingFactorsIndex,
381 double* outSumLogLikelihood) {
382 return BeagleCPUImpl<BEAGLE_CPU_SSE_DOUBLE>::calcEdgeLogLikelihoods(
386 categoryWeightsIndex,
387 stateFrequenciesIndex,
389 outSumLogLikelihood);
516 BEAGLE_CPU_SSE_TEMPLATE
517 int BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::getPaddedPatternsModulus() {
521 BEAGLE_CPU_SSE_TEMPLATE
522 const char* BeagleCPUSSEImpl<BEAGLE_CPU_SSE_FLOAT>::getName() {
523 return getBeagleCPUSSEName<float>();
526 BEAGLE_CPU_SSE_TEMPLATE
527 const char* BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::getName() {
528 return getBeagleCPUSSEName<double>();
531 BEAGLE_CPU_SSE_TEMPLATE
532 const long BeagleCPUSSEImpl<BEAGLE_CPU_SSE_FLOAT>::getFlags() {
533 return BEAGLE_FLAG_COMPUTATION_SYNCH |
534 BEAGLE_FLAG_THREADING_NONE |
535 BEAGLE_FLAG_PROCESSOR_CPU |
536 BEAGLE_FLAG_PRECISION_SINGLE |
537 BEAGLE_FLAG_VECTOR_SSE;
540 BEAGLE_CPU_SSE_TEMPLATE
541 const long BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::getFlags() {
542 return BEAGLE_FLAG_COMPUTATION_SYNCH |
543 BEAGLE_FLAG_THREADING_NONE |
544 BEAGLE_FLAG_PROCESSOR_CPU |
545 BEAGLE_FLAG_PRECISION_DOUBLE |
546 BEAGLE_FLAG_VECTOR_SSE;
553 BEAGLE_CPU_FACTORY_TEMPLATE
554 BeagleImpl* BeagleCPUSSEImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::createImpl(
int tipCount,
555 int partialsBufferCount,
556 int compactBufferCount,
559 int eigenBufferCount,
560 int matrixBufferCount,
562 int scaleBufferCount,
564 long preferenceFlags,
565 long requirementFlags,
568 if (!CPUSupportsSSE())
571 if (stateCount & 1) {
572 BeagleCPUSSEImpl<REALTYPE, T_PAD_SSE_ODD, P_PAD_SSE_ODD>* impl =
573 new BeagleCPUSSEImpl<REALTYPE, T_PAD_SSE_ODD, P_PAD_SSE_ODD>();
577 if (impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount,
578 patternCount, eigenBufferCount, matrixBufferCount,
579 categoryCount,scaleBufferCount, resourceNumber, preferenceFlags, requirementFlags) == 0)
583 if (DEBUGGING_OUTPUT)
584 std::cerr <<
"exception in initialize\n";
591 BeagleCPUSSEImpl<REALTYPE, T_PAD_SSE_EVEN, P_PAD_SSE_EVEN>* impl =
592 new BeagleCPUSSEImpl<REALTYPE, T_PAD_SSE_EVEN, P_PAD_SSE_EVEN>();
596 if (impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount,
597 patternCount, eigenBufferCount, matrixBufferCount,
598 categoryCount,scaleBufferCount, resourceNumber, preferenceFlags, requirementFlags) == 0)
602 if (DEBUGGING_OUTPUT)
603 std::cerr <<
"exception in initialize\n";
614 BEAGLE_CPU_FACTORY_TEMPLATE
615 const char* BeagleCPUSSEImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::getName() {
616 return getBeagleCPUSSEName<BEAGLE_CPU_FACTORY_GENERIC>();
620 const long BeagleCPUSSEImplFactory<double>::getFlags() {
621 return BEAGLE_FLAG_COMPUTATION_SYNCH |
622 BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO |
623 BEAGLE_FLAG_THREADING_NONE |
624 BEAGLE_FLAG_PROCESSOR_CPU |
625 BEAGLE_FLAG_VECTOR_SSE |
626 BEAGLE_FLAG_PRECISION_DOUBLE |
627 BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW |
628 BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL |
629 BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED;
633 const long BeagleCPUSSEImplFactory<float>::getFlags() {
634 return BEAGLE_FLAG_COMPUTATION_SYNCH |
635 BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO |
636 BEAGLE_FLAG_THREADING_NONE |
637 BEAGLE_FLAG_PROCESSOR_CPU |
638 BEAGLE_FLAG_VECTOR_SSE |
639 BEAGLE_FLAG_PRECISION_SINGLE |
640 BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW |
641 BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL |
642 BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED;
691 #endif //BEAGLE_CPU_SSE_IMPL_HPP