HMSBEAGLE  1.0.0
BeagleCPUSSEImpl.hpp
1 /*
2  * BeagleCPUSSEImpl.hpp
3  * BEAGLE
4  *
5  * Copyright 2010 Phylogenetic Likelihood Working Group
6  *
7  * This file is part of BEAGLE.
8  *
9  * BEAGLE is free software: you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser General Public License as
11  * published by the Free Software Foundation, either version 3 of
12  * the License, or (at your option) any later version.
13  *
14  * BEAGLE is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with BEAGLE. If not, see
21  * <http://www.gnu.org/licenses/>.
22  *
23  * @author Marc Suchard
24  * @author David Swofford
25  */
26 
27 #ifndef BEAGLE_CPU_SSE_IMPL_HPP
28 #define BEAGLE_CPU_SSE_IMPL_HPP
29 
30 
31 #ifdef HAVE_CONFIG_H
32 #include "libhmsbeagle/config.h"
33 #endif
34 
35 #include <cstdio>
36 #include <cstdlib>
37 #include <iostream>
38 #include <cstring>
39 #include <cmath>
40 #include <cassert>
41 
42 #include "libhmsbeagle/beagle.h"
43 #include "libhmsbeagle/CPU/BeagleCPUImpl.h"
44 #include "libhmsbeagle/CPU/BeagleCPUSSEImpl.h"
45 #include "libhmsbeagle/CPU/SSEDefinitions.h"
46 
47 namespace beagle {
48 namespace cpu {
49 
50 BEAGLE_CPU_FACTORY_TEMPLATE
51 inline const char* getBeagleCPUSSEName(){ return "CPU-SSE-Unknown"; };
52 
53 template<>
54 inline const char* getBeagleCPUSSEName<double>(){ return "CPU-SSE-Double"; };
55 
56 template<>
57 inline const char* getBeagleCPUSSEName<float>(){ return "CPU-SSE-Single"; };
58 
59 /*
60  * Calculates partial likelihoods at a node when both children have states.
61  */
62 
63 BEAGLE_CPU_SSE_TEMPLATE
64 void BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcStatesStates(double* destP,
65  const int* states_q,
66  const double* matrices_q,
67  const int* states_r,
68  const double* matrices_r) {
69 
70  BeagleCPUImpl<BEAGLE_CPU_SSE_DOUBLE>::calcStatesStates(destP,
71  states_q,
72  matrices_q,
73  states_r,
74  matrices_r);
75 }
76 
77 
78 //template <>
79 //void BeagleCPUSSEImpl<double>::calcStatesStates(double* destP,
80 // const int* states_q,
81 // const double* matrices_q,
82 // const int* states_r,
83 // const double* matrices_r) {
84 //
85 // VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];
86 //
87 // int w = 0;
88 // V_Real *destPvec = (V_Real *)destP;
89 //
90 // for (int l = 0; l < kCategoryCount; l++) {
91 //
92 // SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);
93 //
94 // for (int k = 0; k < kPatternCount; k++) {
95 //
96 // const int state_q = states_q[k];
97 // const int state_r = states_r[k];
98 //
99 // *destPvec++ = VEC_MULT(vu_mq[state_q][0].vx, vu_mr[state_r][0].vx);
100 // *destPvec++ = VEC_MULT(vu_mq[state_q][1].vx, vu_mr[state_r][1].vx);
101 //
102 // }
103 //
104 // w += OFFSET*4;
105 // if (kExtraPatterns)
106 // destPvec += kExtraPatterns * 2;
107 // }
108 //}
109 
110 /*
111  * Calculates partial likelihoods at a node when one child has states and one has partials.
112  SSE version
113  */
114 BEAGLE_CPU_SSE_TEMPLATE
115 void BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcStatesPartials(double* destP,
116  const int* states_q,
117  const double* matrices_q,
118  const double* partials_r,
119  const double* matrices_r) {
120  BeagleCPUImpl<BEAGLE_CPU_SSE_DOUBLE>::calcStatesPartials(
121  destP,
122  states_q,
123  matrices_q,
124  partials_r,
125  matrices_r);
126 }
127 
128 //
129 //
130 //template <>
131 //void BeagleCPUSSEImpl<double>::calcStatesPartials(double* destP,
132 // const int* states_q,
133 // const double* matrices_q,
134 // const double* partials_r,
135 // const double* matrices_r) {
136 //
137 // int v = 0;
138 // int w = 0;
139 //
140 // VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];
141 // V_Real *destPvec = (V_Real *)destP;
142 // V_Real destr_01, destr_23;
143 //
144 // for (int l = 0; l < kCategoryCount; l++) {
145 //
146 // SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);
147 //
148 // for (int k = 0; k < kPatternCount; k++) {
149 //
150 // const int state_q = states_q[k];
151 // V_Real vp0, vp1, vp2, vp3;
152 // SSE_PREFETCH_PARTIALS(vp,partials_r,v);
153 //
154 // destr_01 = VEC_MULT(vp0, vu_mr[0][0].vx);
155 // destr_01 = VEC_MADD(vp1, vu_mr[1][0].vx, destr_01);
156 // destr_01 = VEC_MADD(vp2, vu_mr[2][0].vx, destr_01);
157 // destr_01 = VEC_MADD(vp3, vu_mr[3][0].vx, destr_01);
158 // destr_23 = VEC_MULT(vp0, vu_mr[0][1].vx);
159 // destr_23 = VEC_MADD(vp1, vu_mr[1][1].vx, destr_23);
160 // destr_23 = VEC_MADD(vp2, vu_mr[2][1].vx, destr_23);
161 // destr_23 = VEC_MADD(vp3, vu_mr[3][1].vx, destr_23);
162 //
163 // *destPvec++ = VEC_MULT(vu_mq[state_q][0].vx, destr_01);
164 // *destPvec++ = VEC_MULT(vu_mq[state_q][1].vx, destr_23);
165 //
166 // v += 4;
167 // }
168 // w += OFFSET*4;
169 // if (kExtraPatterns) {
170 // destPvec += kExtraPatterns * 2;
171 // v += kExtraPatterns * 4;
172 // }
173 // }
174 //}
175 
176 //template<>
177 //void inline BeagleCPUSSEImpl<double>::innerPartialsPartials(
178 // const double* __restrict partials1,
179 // const double* __restrict matrices1,
180 // const double* __restrict partials2,
181 // const double* __restrict matrices2,
182 // V_Real& sum1_vec,
183 // V_Real& sum2_vec,
184 // V_Real& out,
185 // int& v,
186 // int& w) {
187 // int j = 0;
188 // sum1_vec = VEC_SETZERO();
189 // sum2_vec = VEC_SETZERO();
190 // for (; j < kStateCountMinusOne; j += 2) {
191 // sum1_vec = VEC_MADD(
192 // VEC_LOAD(matrices1 + w + j), // TODO This only works if w is even
193 // VEC_LOAD(partials1 + v + j), // TODO This only works if v is even
194 // sum1_vec);
195 // sum2_vec = VEC_MADD(
196 // VEC_LOAD(matrices2 + w + j),
197 // VEC_LOAD(partials2 + v + j),
198 // sum2_vec);
199 // }
200 //
201 // out = VEC_MULT(
202 // VEC_ADD(sum1_vec, VEC_SWAP(sum1_vec)),
203 // VEC_ADD(sum2_vec, VEC_SWAP(sum2_vec))
204 // );
205 //}
206 
207 //#define DOUBLE_UNROLL // Does not appear to save any time
208 
/*
 * Calculates partial likelihoods at a node when both children have partials
 * (SSE, double precision).  For every rate category l, pattern k, and parent
 * state i, this forms the product of the two child sums
 *     (sum_j matrices1[w+j] * partials1[v+j]) * (sum_j matrices2[w+j] * partials2[v+j])
 * and writes it to destP.  The inner j-loop consumes two doubles per
 * iteration via 128-bit vectors (V_Real; see SSEDefinitions.h).
 *
 * destP     - destination partials, laid out category-major with
 *             kPartialsPaddedStateCount doubles per pattern
 * partials1 - first child partials (same layout as destP)
 * matrices1 - first child transition matrices, kMatrixSize doubles per category
 * partials2 - second child partials
 * matrices2 - second child transition matrices
 *
 * NOTE(review): the aligned vector loads assume the running offsets v and w
 * stay even (see TODOs below) — this relies on the T_PAD/P_PAD padding chosen
 * by the factory; confirm before changing the padding scheme.
 */
BEAGLE_CPU_SSE_TEMPLATE
void BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcPartialsPartials(double* __restrict destP,
                                                                   const double* __restrict partials1,
                                                                   const double* __restrict matrices1,
                                                                   const double* __restrict partials2,
                                                                   const double* __restrict matrices2) {
    // Upper bound for the j-loop; strides of 2 doubles (one vector) stay below it.
    int stateCountMinusOne = kPartialsPaddedStateCount - 1;
    // One thread per rate category; each category writes a disjoint destP slice.
#pragma omp parallel for num_threads(kCategoryCount)
    for (int l = 0; l < kCategoryCount; l++) {
        // Output cursor and child-partials read offset for this category.
        double* destPu = destP + l*kPartialsPaddedStateCount*kPatternCount;
        int v = l*kPartialsPaddedStateCount*kPatternCount;
        for (int k = 0; k < kPatternCount; k++) {
            // Rewind to this category's transition matrix for each pattern.
            int w = l * kMatrixSize;
            for (int i = 0; i < kStateCount;
#ifdef DOUBLE_UNROLL
                 i += 2 // TODO This only works if stateCount is even
#else
                 ++i
#endif
                 ) {
                // Two-lane accumulators for the matrix-row / partials dot products.
                register V_Real sum1_vecA = VEC_SETZERO();
                register V_Real sum2_vecA = VEC_SETZERO();
                for (int j = 0; j < stateCountMinusOne; j += 2) {
                    sum1_vecA = VEC_MADD(
                                 VEC_LOAD(matrices1 + w + j), // TODO This only works if w is even
                                 VEC_LOAD(partials1 + v + j), // TODO This only works if v is even
                                 sum1_vecA);
                    sum2_vecA = VEC_MADD(
                                 VEC_LOAD(matrices2 + w + j),
                                 VEC_LOAD(partials2 + v + j),
                                 sum2_vecA);
                }

                // Horizontal reduction: adding each accumulator to its
                // lane-swapped self (VEC_SWAP; see SSEDefinitions.h) leaves the
                // full dot product in both lanes, then the two child sums are
                // multiplied together.
                sum1_vecA = VEC_MULT(
                             VEC_ADD(sum1_vecA, VEC_SWAP(sum1_vecA)),
                             VEC_ADD(sum2_vecA, VEC_SWAP(sum2_vecA))
                );

                // increment for the extra column at the end
                w += kStateCount + T_PAD;

#ifndef DOUBLE_UNROLL
                // Store single value
                VEC_STORE_SCALAR(destPu, sum1_vecA);
                destPu++;
#endif

#ifdef DOUBLE_UNROLL
                // Unrolled second copy of the computation above, for state i+1.
                register V_Real sum1_vecB = VEC_SETZERO();
                register V_Real sum2_vecB = VEC_SETZERO();
                for (int j = 0; j < stateCountMinusOne; j += 2) {
                    sum1_vecB = VEC_MADD(
                                 VEC_LOAD(matrices1 + w + j), // TODO This only works if w is even
                                 VEC_LOAD(partials1 + v + j), // TODO This only works if v is even
                                 sum1_vecB);
                    sum2_vecB = VEC_MADD(
                                 VEC_LOAD(matrices2 + w + j),
                                 VEC_LOAD(partials2 + v + j),
                                 sum2_vecB);
                }

                sum1_vecB = VEC_MULT(
                             VEC_ADD(sum1_vecB, VEC_SWAP(sum1_vecB)),
                             VEC_ADD(sum2_vecB, VEC_SWAP(sum2_vecB))
                );

                // increment for the extra column at the end
                w += kStateCount + T_PAD;

                // Store both partials in one transaction
                VEC_STORE(destPu, VEC_MOVE(sum1_vecA, sum1_vecB));
                destPu += 2;
#endif

            }
            destPu += P_PAD;                // skip destination padding
            v += kPartialsPaddedStateCount; // advance to the next pattern's partials
        }
    }
}
289 
/*
 * Same computation as calcPartialsPartials (SSE, double precision), except
 * that every result for pattern k is additionally divided by the per-pattern
 * rescaling factor scaleFactors[k] before being stored.
 *
 * destP        - destination partials (category-major, padded layout)
 * partials1/2  - child partials
 * matrices1/2  - child transition matrices, kMatrixSize doubles per category
 * scaleFactors - one scale divisor per pattern
 *
 * NOTE(review): like calcPartialsPartials, the vector loads assume v and w
 * stay even (see TODOs) — this relies on the SSE padding set up by the factory.
 */
BEAGLE_CPU_SSE_TEMPLATE
void BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcPartialsPartialsFixedScaling(
                                                       double* __restrict destP,
                                                 const double* __restrict partials1,
                                                 const double* __restrict matrices1,
                                                 const double* __restrict partials2,
                                                 const double* __restrict matrices2,
                                                 const double* __restrict scaleFactors) {
    // Upper bound for the j-loop; strides of 2 doubles (one vector) stay below it.
    int stateCountMinusOne = kPartialsPaddedStateCount - 1;
    // One thread per rate category; each category writes a disjoint destP slice.
#pragma omp parallel for num_threads(kCategoryCount)
    for (int l = 0; l < kCategoryCount; l++) {
        // Output cursor and child-partials read offset for this category.
        double* destPu = destP + l*kPartialsPaddedStateCount*kPatternCount;
        int v = l*kPartialsPaddedStateCount*kPatternCount;
        for (int k = 0; k < kPatternCount; k++) {
            // Rewind to this category's transition matrix for each pattern.
            int w = l * kMatrixSize;
            // Broadcast this pattern's scale factor into both vector lanes.
            const V_Real scalar = VEC_SPLAT(scaleFactors[k]);
            for (int i = 0; i < kStateCount; i++) {

                // Two-lane accumulators for the matrix-row / partials dot products.
                register V_Real sum1_vec;
                register V_Real sum2_vec;

                int j = 0;
                sum1_vec = VEC_SETZERO();
                sum2_vec = VEC_SETZERO();
                for ( ; j < stateCountMinusOne; j += 2) {
                    sum1_vec = VEC_MADD(
                                VEC_LOAD(matrices1 + w + j), // TODO This only works if w is even
                                VEC_LOAD(partials1 + v + j), // TODO This only works if v is even
                                sum1_vec);
                    sum2_vec = VEC_MADD(
                                VEC_LOAD(matrices2 + w + j),
                                VEC_LOAD(partials2 + v + j),
                                sum2_vec);
                }
                // Horizontal reduction of both dot products (VEC_SWAP exchanges
                // the lanes), product of the two child sums, then division by
                // the pattern's scale factor before the scalar store.
                VEC_STORE_SCALAR(destPu,
                                 VEC_DIV(VEC_MULT(
                                     VEC_ADD(sum1_vec, VEC_SWAP(sum1_vec)),
                                     VEC_ADD(sum2_vec, VEC_SWAP(sum2_vec))
                                 ), scalar));


                // increment for the extra column at the end
                w += kStateCount + T_PAD;

                destPu++;
            }
            destPu += P_PAD;                // skip destination padding
            v += kPartialsPaddedStateCount; // advance to the next pattern's partials
        }
    }
}
341 
342 
343 BEAGLE_CPU_SSE_TEMPLATE
344 void BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcPartialsPartialsAutoScaling(double* destP,
345  const double* partials_q,
346  const double* matrices_q,
347  const double* partials_r,
348  const double* matrices_r,
349  int* activateScaling) {
350  BeagleCPUImpl<BEAGLE_CPU_SSE_DOUBLE>::calcPartialsPartialsAutoScaling(destP,
351  partials_q,
352  matrices_q,
353  partials_r,
354  matrices_r,
355  activateScaling);
356 }
357 
358 //template <>
359 //void BeagleCPUSSEImpl<double>::calcPartialsPartialsAutoScaling(double* destP,
360 // const double* partials_q,
361 // const double* matrices_q,
362 // const double* partials_r,
363 // const double* matrices_r,
364 // int* activateScaling) {
365 // // TODO: implement calcPartialsPartialsAutoScaling with SSE
366 // BeagleCPUImpl<double>::calcPartialsPartialsAutoScaling(destP,
367 // partials_q,
368 // matrices_q,
369 // partials_r,
370 // matrices_r,
371 // activateScaling);
372 //}
373 
374 BEAGLE_CPU_SSE_TEMPLATE
375 int BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::calcEdgeLogLikelihoods(const int parIndex,
376  const int childIndex,
377  const int probIndex,
378  const int categoryWeightsIndex,
379  const int stateFrequenciesIndex,
380  const int scalingFactorsIndex,
381  double* outSumLogLikelihood) {
382 return BeagleCPUImpl<BEAGLE_CPU_SSE_DOUBLE>::calcEdgeLogLikelihoods(
383  parIndex,
384  childIndex,
385  probIndex,
386  categoryWeightsIndex,
387  stateFrequenciesIndex,
388  scalingFactorsIndex,
389  outSumLogLikelihood);
390 }
391 
392 //template <>
393 // int BeagleCPUSSEImpl<double>::calcEdgeLogLikelihoods(const int parIndex,
394 // const int childIndex,
395 // const int probIndex,
396 // const int categoryWeightsIndex,
397 // const int stateFrequenciesIndex,
398 // const int scalingFactorsIndex,
399 // double* outSumLogLikelihood) {
400 // // TODO: implement derivatives for calculateEdgeLnL
401 //
402 // int returnCode = BEAGLE_SUCCESS;
403 //
404 // assert(parIndex >= kTipCount);
405 //
406 // const double* cl_r = gPartials[parIndex];
407 // double* cl_p = integrationTmp;
408 // const double* transMatrix = gTransitionMatrices[probIndex];
409 // const double* wt = gCategoryWeights[categoryWeightsIndex];
410 // const double* freqs = gStateFrequencies[stateFrequenciesIndex];
411 //
412 // memset(cl_p, 0, (kPatternCount * kStateCount)*sizeof(double));
413 //
414 // if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child
415 //
416 // const int* statesChild = gTipStates[childIndex];
417 //
418 // int w = 0;
419 // V_Real *vcl_r = (V_Real *)cl_r;
420 // for(int l = 0; l < kCategoryCount; l++) {
421 //
422 // VecUnion vu_m[OFFSET][2];
423 // SSE_PREFETCH_MATRIX(transMatrix + w, vu_m)
424 //
425 // V_Real *vcl_p = (V_Real *)cl_p;
426 //
427 // for(int k = 0; k < kPatternCount; k++) {
428 //
429 // const int stateChild = statesChild[k];
430 // V_Real vwt = VEC_SPLAT(wt[l]);
431 //
432 // V_Real wtdPartials = VEC_MULT(*vcl_r++, vwt);
433 // *vcl_p++ = VEC_MADD(vu_m[stateChild][0].vx, wtdPartials, *vcl_p);
434 //
435 // wtdPartials = VEC_MULT(*vcl_r++, vwt);
436 // *vcl_p++ = VEC_MADD(vu_m[stateChild][1].vx, wtdPartials, *vcl_p);
437 // }
438 // w += OFFSET*4;
439 // vcl_r += 2 * kExtraPatterns;
440 // }
441 // } else { // Integrate against a partial at the child
442 //
443 // const double* cl_q = gPartials[childIndex];
444 // V_Real * vcl_r = (V_Real *)cl_r;
445 // int v = 0;
446 // int w = 0;
447 //
448 // for(int l = 0; l < kCategoryCount; l++) {
449 //
450 // V_Real * vcl_p = (V_Real *)cl_p;
451 //
452 // VecUnion vu_m[OFFSET][2];
453 // SSE_PREFETCH_MATRIX(transMatrix + w, vu_m)
454 //
455 // for(int k = 0; k < kPatternCount; k++) {
456 // V_Real vclp_01, vclp_23;
457 // V_Real vwt = VEC_SPLAT(wt[l]);
458 //
459 // V_Real vcl_q0, vcl_q1, vcl_q2, vcl_q3;
460 // SSE_PREFETCH_PARTIALS(vcl_q,cl_q,v);
461 //
462 // vclp_01 = VEC_MULT(vcl_q0, vu_m[0][0].vx);
463 // vclp_01 = VEC_MADD(vcl_q1, vu_m[1][0].vx, vclp_01);
464 // vclp_01 = VEC_MADD(vcl_q2, vu_m[2][0].vx, vclp_01);
465 // vclp_01 = VEC_MADD(vcl_q3, vu_m[3][0].vx, vclp_01);
466 // vclp_23 = VEC_MULT(vcl_q0, vu_m[0][1].vx);
467 // vclp_23 = VEC_MADD(vcl_q1, vu_m[1][1].vx, vclp_23);
468 // vclp_23 = VEC_MADD(vcl_q2, vu_m[2][1].vx, vclp_23);
469 // vclp_23 = VEC_MADD(vcl_q3, vu_m[3][1].vx, vclp_23);
470 // vclp_01 = VEC_MULT(vclp_01, vwt);
471 // vclp_23 = VEC_MULT(vclp_23, vwt);
472 //
473 // *vcl_p++ = VEC_MADD(vclp_01, *vcl_r++, *vcl_p);
474 // *vcl_p++ = VEC_MADD(vclp_23, *vcl_r++, *vcl_p);
475 //
476 // v += 4;
477 // }
478 // w += 4*OFFSET;
479 // if (kExtraPatterns) {
480 // vcl_r += 2 * kExtraPatterns;
481 // v += 4 * kExtraPatterns;
482 // }
483 //
484 // }
485 // }
486 //
487 // int u = 0;
488 // for(int k = 0; k < kPatternCount; k++) {
489 // double sumOverI = 0.0;
490 // for(int i = 0; i < kStateCount; i++) {
491 // sumOverI += freqs[i] * cl_p[u];
492 // u++;
493 // }
494 //
495 // outLogLikelihoodsTmp[k] = log(sumOverI);
496 // }
497 //
498 //
499 // if (scalingFactorsIndex != BEAGLE_OP_NONE) {
500 // const double* scalingFactors = gScaleBuffers[scalingFactorsIndex];
501 // for(int k=0; k < kPatternCount; k++)
502 // outLogLikelihoodsTmp[k] += scalingFactors[k];
503 // }
504 //
505 // *outSumLogLikelihood = 0.0;
506 // for (int i = 0; i < kPatternCount; i++) {
507 // *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
508 // }
509 //
510 // if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
511 // returnCode = BEAGLE_ERROR_FLOATING_POINT;
512 //
513 // return returnCode;
514 //}
515 
516 BEAGLE_CPU_SSE_TEMPLATE
517 int BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::getPaddedPatternsModulus() {
518  return 1; // We currently do not vectorize across patterns
519 }
520 
521 BEAGLE_CPU_SSE_TEMPLATE
522 const char* BeagleCPUSSEImpl<BEAGLE_CPU_SSE_FLOAT>::getName() {
523  return getBeagleCPUSSEName<float>();
524 }
525 
526 BEAGLE_CPU_SSE_TEMPLATE
527 const char* BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::getName() {
528  return getBeagleCPUSSEName<double>();
529 }
530 
531 BEAGLE_CPU_SSE_TEMPLATE
532 const long BeagleCPUSSEImpl<BEAGLE_CPU_SSE_FLOAT>::getFlags() {
533  return BEAGLE_FLAG_COMPUTATION_SYNCH |
534  BEAGLE_FLAG_THREADING_NONE |
535  BEAGLE_FLAG_PROCESSOR_CPU |
536  BEAGLE_FLAG_PRECISION_SINGLE |
537  BEAGLE_FLAG_VECTOR_SSE;
538 }
539 
540 BEAGLE_CPU_SSE_TEMPLATE
541 const long BeagleCPUSSEImpl<BEAGLE_CPU_SSE_DOUBLE>::getFlags() {
542  return BEAGLE_FLAG_COMPUTATION_SYNCH |
543  BEAGLE_FLAG_THREADING_NONE |
544  BEAGLE_FLAG_PROCESSOR_CPU |
545  BEAGLE_FLAG_PRECISION_DOUBLE |
546  BEAGLE_FLAG_VECTOR_SSE;
547 }
548 
549 
551 // BeagleImplFactory public methods
552 
553 BEAGLE_CPU_FACTORY_TEMPLATE
554 BeagleImpl* BeagleCPUSSEImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::createImpl(int tipCount,
555  int partialsBufferCount,
556  int compactBufferCount,
557  int stateCount,
558  int patternCount,
559  int eigenBufferCount,
560  int matrixBufferCount,
561  int categoryCount,
562  int scaleBufferCount,
563  int resourceNumber,
564  long preferenceFlags,
565  long requirementFlags,
566  int* errorCode) {
567 
568  if (!CPUSupportsSSE())
569  return NULL;
570 
571  if (stateCount & 1) { // is odd
572  BeagleCPUSSEImpl<REALTYPE, T_PAD_SSE_ODD, P_PAD_SSE_ODD>* impl =
573  new BeagleCPUSSEImpl<REALTYPE, T_PAD_SSE_ODD, P_PAD_SSE_ODD>();
574 
575 
576  try {
577  if (impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount,
578  patternCount, eigenBufferCount, matrixBufferCount,
579  categoryCount,scaleBufferCount, resourceNumber, preferenceFlags, requirementFlags) == 0)
580  return impl;
581  }
582  catch(...) {
583  if (DEBUGGING_OUTPUT)
584  std::cerr << "exception in initialize\n";
585  delete impl;
586  throw;
587  }
588 
589  delete impl;
590  } else {
591  BeagleCPUSSEImpl<REALTYPE, T_PAD_SSE_EVEN, P_PAD_SSE_EVEN>* impl =
592  new BeagleCPUSSEImpl<REALTYPE, T_PAD_SSE_EVEN, P_PAD_SSE_EVEN>();
593 
594 
595  try {
596  if (impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount,
597  patternCount, eigenBufferCount, matrixBufferCount,
598  categoryCount,scaleBufferCount, resourceNumber, preferenceFlags, requirementFlags) == 0)
599  return impl;
600  }
601  catch(...) {
602  if (DEBUGGING_OUTPUT)
603  std::cerr << "exception in initialize\n";
604  delete impl;
605  throw;
606  }
607 
608  delete impl;
609  }
610 
611  return NULL;
612 }
613 
614 BEAGLE_CPU_FACTORY_TEMPLATE
615 const char* BeagleCPUSSEImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::getName() {
616  return getBeagleCPUSSEName<BEAGLE_CPU_FACTORY_GENERIC>();
617 }
618 
619 template <>
620 const long BeagleCPUSSEImplFactory<double>::getFlags() {
621  return BEAGLE_FLAG_COMPUTATION_SYNCH |
622  BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO |
623  BEAGLE_FLAG_THREADING_NONE |
624  BEAGLE_FLAG_PROCESSOR_CPU |
625  BEAGLE_FLAG_VECTOR_SSE |
626  BEAGLE_FLAG_PRECISION_DOUBLE |
627  BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW |
628  BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL |
629  BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED;
630 }
631 
632 template <>
633 const long BeagleCPUSSEImplFactory<float>::getFlags() {
634  return BEAGLE_FLAG_COMPUTATION_SYNCH |
635  BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO |
636  BEAGLE_FLAG_THREADING_NONE |
637  BEAGLE_FLAG_PROCESSOR_CPU |
638  BEAGLE_FLAG_VECTOR_SSE |
639  BEAGLE_FLAG_PRECISION_SINGLE |
640  BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW |
641  BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL |
642  BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED;
643 }
644 
645 // Code to save:
646 // int j = 0;
667 // sum1_vec = VEC_SETZERO();
668 // sum2_vec = VEC_SETZERO();
672 // Next snippet
673 //#if 1
674 // VEC_STORE_SCALAR(destP + u,
675 // VEC_MULT(
676 // VEC_ADD(sum1_vec, VEC_SWAP(sum1_vec)),
677 // VEC_ADD(sum2_vec, VEC_SWAP(sum2_vec))
678 // ));
679 //#else
680 // VecUnion t1, t2;
681 // t1.vx = sum1_vec;
682 // t2.vx = sum2_vec;
683 // destP[u] = (t1.x[0] + t1.x[1] //+ endSum1
684 // ) * (t2.x[0] + t2.x[1] //+ endSum2
685 // );
686 //#endif
687 
688 }
689 }
690 
691 #endif //BEAGLE_CPU_SSE_IMPL_HPP