HMSBEAGLE  1.0.0
BeagleCPU4StateSSEImpl.hpp
/*
 * BeagleCPU4StateSSEImpl.hpp
 * BEAGLE
 *
 * Copyright 2009 Phylogenetic Likelihood Working Group
 *
 * This file is part of BEAGLE.
 *
 * BEAGLE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * BEAGLE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with BEAGLE.  If not, see
 * <http://www.gnu.org/licenses/>.
 *
 * @author Marc Suchard
 * @author David Swofford
 */

#ifndef BEAGLE_CPU_4STATE_SSE_IMPL_HPP
#define BEAGLE_CPU_4STATE_SSE_IMPL_HPP


#ifdef HAVE_CONFIG_H
#include "libhmsbeagle/config.h"
#endif

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <cstring>
#include <cmath>
#include <cassert>

#include "libhmsbeagle/beagle.h"
#include "libhmsbeagle/CPU/BeagleCPU4StateSSEImpl.h"
#include "libhmsbeagle/CPU/SSEDefinitions.h"

/* Loads partials into SSE vectors */
#if 0
#define SSE_PREFETCH_PARTIALS(dest, src, v) \
    dest##0 = VEC_SPLAT(src[v + 0]); \
    dest##1 = VEC_SPLAT(src[v + 1]); \
    dest##2 = VEC_SPLAT(src[v + 2]); \
    dest##3 = VEC_SPLAT(src[v + 3]);
#else // Load four partials in two 128-bit memory transactions
#define SSE_PREFETCH_PARTIALS(dest, src, v) \
    V_Real tmp_##dest##01, tmp_##dest##23; \
    tmp_##dest##01 = _mm_load_pd(&src[v + 0]); \
    tmp_##dest##23 = _mm_load_pd(&src[v + 2]); \
    dest##0 = _mm_shuffle_pd(tmp_##dest##01, tmp_##dest##01, _MM_SHUFFLE2(0,0)); \
    dest##1 = _mm_shuffle_pd(tmp_##dest##01, tmp_##dest##01, _MM_SHUFFLE2(1,1)); \
    dest##2 = _mm_shuffle_pd(tmp_##dest##23, tmp_##dest##23, _MM_SHUFFLE2(0,0)); \
    dest##3 = _mm_shuffle_pd(tmp_##dest##23, tmp_##dest##23, _MM_SHUFFLE2(1,1));
#endif
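/* After expansion, dest##0..dest##3 each hold one partial likelihood (src[v+0..3])
   broadcast into both lanes of a 128-bit vector; the #else branch above achieves
   this with two aligned 16-byte loads plus shuffles instead of four scalar splats. */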

/* Loads (transposed) finite-time transition matrices into SSE vectors */
#define SSE_PREFETCH_MATRICES(src_m1, src_m2, dest_vu_m1, dest_vu_m2) \
    const double *m1 = (src_m1); \
    const double *m2 = (src_m2); \
    for (int i = 0; i < OFFSET; i++, m1++, m2++) { \
        dest_vu_m1[i][0].x[0] = m1[0*OFFSET]; \
        dest_vu_m1[i][0].x[1] = m1[1*OFFSET]; \
        dest_vu_m2[i][0].x[0] = m2[0*OFFSET]; \
        dest_vu_m2[i][0].x[1] = m2[1*OFFSET]; \
        dest_vu_m1[i][1].x[0] = m1[2*OFFSET]; \
        dest_vu_m1[i][1].x[1] = m1[3*OFFSET]; \
        dest_vu_m2[i][1].x[0] = m2[2*OFFSET]; \
        dest_vu_m2[i][1].x[1] = m2[3*OFFSET]; \
    }

#define SSE_PREFETCH_MATRIX(src_m1, dest_vu_m1) \
    const double *m1 = (src_m1); \
    for (int i = 0; i < OFFSET; i++, m1++) { \
        dest_vu_m1[i][0].x[0] = m1[0*OFFSET]; \
        dest_vu_m1[i][0].x[1] = m1[1*OFFSET]; \
        dest_vu_m1[i][1].x[0] = m1[2*OFFSET]; \
        dest_vu_m1[i][1].x[1] = m1[3*OFFSET]; \
    }
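/* After expansion, dest_vu[i][0].x holds elements m[0*OFFSET + i] and m[1*OFFSET + i]
   and dest_vu[i][1].x holds m[2*OFFSET + i] and m[3*OFFSET + i] of the OFFSET-strided
   matrix, so indexing by a child state yields the four entries needed for all
   destination states as a pair of 2-double vectors. */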

namespace beagle {
namespace cpu {


BEAGLE_CPU_FACTORY_TEMPLATE
inline const char* getBeagleCPU4StateSSEName(){ return "CPU-4State-SSE-Unknown"; }

template<>
inline const char* getBeagleCPU4StateSSEName<double>(){ return "CPU-4State-SSE-Double"; }

template<>
inline const char* getBeagleCPU4StateSSEName<float>(){ return "CPU-4State-SSE-Single"; }

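// Note: the FLOAT specializations below simply delegate to the generic
// BeagleCPU4StateImpl code paths; only the DOUBLE specializations are
// hand-vectorized with SSE double-precision intrinsics.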
/*
 * Calculates partial likelihoods at a node when both children have states.
 */

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesStates(float* destP,
                                                                      const int* states_q,
                                                                      const float* matrices_q,
                                                                      const int* states_r,
                                                                      const float* matrices_r) {

    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesStates(destP,
                                                                  states_q,
                                                                  matrices_q,
                                                                  states_r,
                                                                  matrices_r);

}


BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcStatesStates(double* destP,
                                                                       const int* states_q,
                                                                       const double* matrices_q,
                                                                       const int* states_r,
                                                                       const double* matrices_r) {

    VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];

    int w = 0;
    V_Real *destPvec = (V_Real *)destP;

    for (int l = 0; l < kCategoryCount; l++) {

        SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);

        for (int k = 0; k < kPatternCount; k++) {

            const int state_q = states_q[k];
            const int state_r = states_r[k];

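            // Tip-tip case: vu_mq[state_q] and vu_mr[state_r] hold, for every parent
            // state, the transition probability to the observed child state, so the
            // two vector multiplies below produce the new partials for parent states
            // 0-1 and 2-3 in one 128-bit operation each.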
            *destPvec++ = VEC_MULT(vu_mq[state_q][0].vx, vu_mr[state_r][0].vx);
            *destPvec++ = VEC_MULT(vu_mq[state_q][1].vx, vu_mr[state_r][1].vx);

        }

        w += OFFSET*4;
        if (kExtraPatterns)
            destPvec += kExtraPatterns * 2;
    }
}

/*
 * Calculates partial likelihoods at a node when one child has states and the
 * other has partials (SSE version).
 */
BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesPartials(float* destP,
                                                                        const int* states_q,
                                                                        const float* matrices_q,
                                                                        const float* partials_r,
                                                                        const float* matrices_r) {
    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesPartials(
        destP,
        states_q,
        matrices_q,
        partials_r,
        matrices_r);
}



BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcStatesPartials(double* destP,
                                                                         const int* states_q,
                                                                         const double* matrices_q,
                                                                         const double* partials_r,
                                                                         const double* matrices_r) {

    int v = 0;
    int w = 0;

    VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];
    V_Real *destPvec = (V_Real *)destP;
    V_Real destr_01, destr_23;

    for (int l = 0; l < kCategoryCount; l++) {

        SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);

        for (int k = 0; k < kPatternCount; k++) {

            const int state_q = states_q[k];
            V_Real vp0, vp1, vp2, vp3;
            SSE_PREFETCH_PARTIALS(vp,partials_r,v);

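            // destr_01/destr_23 accumulate, over the four r-child states, the r-child
            // partials times the corresponding transition probabilities (two parent
            // states per vector); the result is then multiplied element-wise by the
            // q-child probabilities selected by its observed state.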
            destr_01 = VEC_MULT(vp0, vu_mr[0][0].vx);
            destr_01 = VEC_MADD(vp1, vu_mr[1][0].vx, destr_01);
            destr_01 = VEC_MADD(vp2, vu_mr[2][0].vx, destr_01);
            destr_01 = VEC_MADD(vp3, vu_mr[3][0].vx, destr_01);
            destr_23 = VEC_MULT(vp0, vu_mr[0][1].vx);
            destr_23 = VEC_MADD(vp1, vu_mr[1][1].vx, destr_23);
            destr_23 = VEC_MADD(vp2, vu_mr[2][1].vx, destr_23);
            destr_23 = VEC_MADD(vp3, vu_mr[3][1].vx, destr_23);

            *destPvec++ = VEC_MULT(vu_mq[state_q][0].vx, destr_01);
            *destPvec++ = VEC_MULT(vu_mq[state_q][1].vx, destr_23);

            v += 4;
        }
        w += OFFSET*4;
        if (kExtraPatterns) {
            destPvec += kExtraPatterns * 2;
            v += kExtraPatterns * 4;
        }
    }
}

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesPartialsFixedScaling(float* destP,
                                                                                    const int* states1,
                                                                                    const float* __restrict matrices1,
                                                                                    const float* __restrict partials2,
                                                                                    const float* __restrict matrices2,
                                                                                    const float* __restrict scaleFactors) {
    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcStatesPartialsFixedScaling(
        destP,
        states1,
        matrices1,
        partials2,
        matrices2,
        scaleFactors);
}

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcStatesPartialsFixedScaling(double* destP,
                                                                                     const int* states_q,
                                                                                     const double* __restrict matrices_q,
                                                                                     const double* __restrict partials_r,
                                                                                     const double* __restrict matrices_r,
                                                                                     const double* __restrict scaleFactors) {


    int v = 0;
    int w = 0;

    VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];
    V_Real *destPvec = (V_Real *)destP;
    V_Real destr_01, destr_23;

    for (int l = 0; l < kCategoryCount; l++) {

        SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);

        for (int k = 0; k < kPatternCount; k++) {

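            // Broadcast the per-pattern rescaling factor once; both halves of the
            // result vector are divided by it below.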
            const V_Real scaleFactor = VEC_SPLAT(scaleFactors[k]);

            const int state_q = states_q[k];
            V_Real vp0, vp1, vp2, vp3;
            SSE_PREFETCH_PARTIALS(vp,partials_r,v);

            destr_01 = VEC_MULT(vp0, vu_mr[0][0].vx);
            destr_01 = VEC_MADD(vp1, vu_mr[1][0].vx, destr_01);
            destr_01 = VEC_MADD(vp2, vu_mr[2][0].vx, destr_01);
            destr_01 = VEC_MADD(vp3, vu_mr[3][0].vx, destr_01);
            destr_23 = VEC_MULT(vp0, vu_mr[0][1].vx);
            destr_23 = VEC_MADD(vp1, vu_mr[1][1].vx, destr_23);
            destr_23 = VEC_MADD(vp2, vu_mr[2][1].vx, destr_23);
            destr_23 = VEC_MADD(vp3, vu_mr[3][1].vx, destr_23);

            *destPvec++ = VEC_DIV(VEC_MULT(vu_mq[state_q][0].vx, destr_01), scaleFactor);
            *destPvec++ = VEC_DIV(VEC_MULT(vu_mq[state_q][1].vx, destr_23), scaleFactor);

            v += 4;
        }
        w += OFFSET*4;
        if (kExtraPatterns) {
            destPvec += kExtraPatterns * 2;
            v += kExtraPatterns * 4;
        }
    }
}

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartials(float* destP,
                                                                          const float* partials_q,
                                                                          const float* matrices_q,
                                                                          const float* partials_r,
                                                                          const float* matrices_r) {

    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartials(destP,
                                                                      partials_q,
                                                                      matrices_q,
                                                                      partials_r,
                                                                      matrices_r);
}

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcPartialsPartials(double* destP,
                                                                           const double* partials_q,
                                                                           const double* matrices_q,
                                                                           const double* partials_r,
                                                                           const double* matrices_r) {

    int v = 0;
    int w = 0;

    V_Real destq_01, destq_23, destr_01, destr_23;
    VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];
    V_Real *destPvec = (V_Real *)destP;

    for (int l = 0; l < kCategoryCount; l++) {

        /* Load transition-probability matrices into vectors */
        SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);

        for (int k = 0; k < kPatternCount; k++) {

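            // Software-prefetch the q and r partials several patterns ahead to hide
            // memory latency; __builtin_prefetch is a GCC/Clang intrinsic, hence the
            // _WIN32 guard.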
# if 1 && !defined(_WIN32)
            __builtin_prefetch (&partials_q[v+64]);
            __builtin_prefetch (&partials_r[v+64]);
//            __builtin_prefetch (destPvec+32,1,0);
# endif

            V_Real vpq_0, vpq_1, vpq_2, vpq_3;
            SSE_PREFETCH_PARTIALS(vpq_,partials_q,v);

            V_Real vpr_0, vpr_1, vpr_2, vpr_3;
            SSE_PREFETCH_PARTIALS(vpr_,partials_r,v);

# if 1 /* This would probably be faster on PPC/Altivec, which has a fused multiply-add
          vector instruction */

            destq_01 = VEC_MULT(vpq_0, vu_mq[0][0].vx);
            destq_01 = VEC_MADD(vpq_1, vu_mq[1][0].vx, destq_01);
            destq_01 = VEC_MADD(vpq_2, vu_mq[2][0].vx, destq_01);
            destq_01 = VEC_MADD(vpq_3, vu_mq[3][0].vx, destq_01);
            destq_23 = VEC_MULT(vpq_0, vu_mq[0][1].vx);
            destq_23 = VEC_MADD(vpq_1, vu_mq[1][1].vx, destq_23);
            destq_23 = VEC_MADD(vpq_2, vu_mq[2][1].vx, destq_23);
            destq_23 = VEC_MADD(vpq_3, vu_mq[3][1].vx, destq_23);

            destr_01 = VEC_MULT(vpr_0, vu_mr[0][0].vx);
            destr_01 = VEC_MADD(vpr_1, vu_mr[1][0].vx, destr_01);
            destr_01 = VEC_MADD(vpr_2, vu_mr[2][0].vx, destr_01);
            destr_01 = VEC_MADD(vpr_3, vu_mr[3][0].vx, destr_01);
            destr_23 = VEC_MULT(vpr_0, vu_mr[0][1].vx);
            destr_23 = VEC_MADD(vpr_1, vu_mr[1][1].vx, destr_23);
            destr_23 = VEC_MADD(vpr_2, vu_mr[2][1].vx, destr_23);
            destr_23 = VEC_MADD(vpr_3, vu_mr[3][1].vx, destr_23);

# else /* SSE doesn't have a fused multiply-add, so a slight speed gain should be
          achieved by decoupling these operations to avoid dependency stalls */

            V_Real a, b, c, d;

            a = VEC_MULT(vpq_0, vu_mq[0][0].vx);
            b = VEC_MULT(vpq_2, vu_mq[2][0].vx);
            c = VEC_MULT(vpq_0, vu_mq[0][1].vx);
            d = VEC_MULT(vpq_2, vu_mq[2][1].vx);
            a = VEC_MADD(vpq_1, vu_mq[1][0].vx, a);
            b = VEC_MADD(vpq_3, vu_mq[3][0].vx, b);
            c = VEC_MADD(vpq_1, vu_mq[1][1].vx, c);
            d = VEC_MADD(vpq_3, vu_mq[3][1].vx, d);
            destq_01 = VEC_ADD(a, b);
            destq_23 = VEC_ADD(c, d);

            a = VEC_MULT(vpr_0, vu_mr[0][0].vx);
            b = VEC_MULT(vpr_2, vu_mr[2][0].vx);
            c = VEC_MULT(vpr_0, vu_mr[0][1].vx);
            d = VEC_MULT(vpr_2, vu_mr[2][1].vx);
            a = VEC_MADD(vpr_1, vu_mr[1][0].vx, a);
            b = VEC_MADD(vpr_3, vu_mr[3][0].vx, b);
            c = VEC_MADD(vpr_1, vu_mr[1][1].vx, c);
            d = VEC_MADD(vpr_3, vu_mr[3][1].vx, d);
            destr_01 = VEC_ADD(a, b);
            destr_23 = VEC_ADD(c, d);

# endif

# if 1
            destPvec[0] = VEC_MULT(destq_01, destr_01);
            destPvec[1] = VEC_MULT(destq_23, destr_23);
            destPvec += 2;

# else /* VEC_STORE once showed a measurable performance gain here, since it writes
          all (2 double / 4 single) values to memory in a single store; that gain is
          no longer reproducible (?) */

            VEC_STORE(destP + v + 0, VEC_MULT(destq_01, destr_01));
            VEC_STORE(destP + v + 2, VEC_MULT(destq_23, destr_23));

# endif

            v += 4;
        }
        w += OFFSET*4;
        if (kExtraPatterns) {
            destPvec += kExtraPatterns * 2;
            v += kExtraPatterns * 4;
        }
    }
}

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartialsFixedScaling(float* destP,
                                                                                      const float* child0Partials,
                                                                                      const float* child0TransMat,
                                                                                      const float* child1Partials,
                                                                                      const float* child1TransMat,
                                                                                      const float* scaleFactors) {

    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartialsFixedScaling(
        destP,
        child0Partials,
        child0TransMat,
        child1Partials,
        child1TransMat,
        scaleFactors);
}

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcPartialsPartialsFixedScaling(double* destP,
                                                                                       const double* partials_q,
                                                                                       const double* matrices_q,
                                                                                       const double* partials_r,
                                                                                       const double* matrices_r,
                                                                                       const double* scaleFactors) {

    int v = 0;
    int w = 0;

    V_Real destq_01, destq_23, destr_01, destr_23;
    VecUnion vu_mq[OFFSET][2], vu_mr[OFFSET][2];
    V_Real *destPvec = (V_Real *)destP;

    for (int l = 0; l < kCategoryCount; l++) {

        /* Load transition-probability matrices into vectors */
        SSE_PREFETCH_MATRICES(matrices_q + w, matrices_r + w, vu_mq, vu_mr);

        for (int k = 0; k < kPatternCount; k++) {

# if 1 && !defined(_WIN32)
            __builtin_prefetch (&partials_q[v+64]);
            __builtin_prefetch (&partials_r[v+64]);
//            __builtin_prefetch (destPvec+32,1,0);
# endif

            // Prefetch the per-pattern scale factor
//            const V_Real scaleFactor = VEC_LOAD_SCALAR(scaleFactors + k);
            // The VEC_SPLAT form below appears to be faster; the reason is unclear.
            const V_Real scaleFactor = VEC_SPLAT(scaleFactors[k]);

            V_Real vpq_0, vpq_1, vpq_2, vpq_3;
            SSE_PREFETCH_PARTIALS(vpq_,partials_q,v);

            V_Real vpr_0, vpr_1, vpr_2, vpr_3;
            SSE_PREFETCH_PARTIALS(vpr_,partials_r,v);

            // TODO Make below into macro since this repeats from other calcPPs
            destq_01 = VEC_MULT(vpq_0, vu_mq[0][0].vx);
            destq_01 = VEC_MADD(vpq_1, vu_mq[1][0].vx, destq_01);
            destq_01 = VEC_MADD(vpq_2, vu_mq[2][0].vx, destq_01);
            destq_01 = VEC_MADD(vpq_3, vu_mq[3][0].vx, destq_01);
            destq_23 = VEC_MULT(vpq_0, vu_mq[0][1].vx);
            destq_23 = VEC_MADD(vpq_1, vu_mq[1][1].vx, destq_23);
            destq_23 = VEC_MADD(vpq_2, vu_mq[2][1].vx, destq_23);
            destq_23 = VEC_MADD(vpq_3, vu_mq[3][1].vx, destq_23);

            destr_01 = VEC_MULT(vpr_0, vu_mr[0][0].vx);
            destr_01 = VEC_MADD(vpr_1, vu_mr[1][0].vx, destr_01);
            destr_01 = VEC_MADD(vpr_2, vu_mr[2][0].vx, destr_01);
            destr_01 = VEC_MADD(vpr_3, vu_mr[3][0].vx, destr_01);
            destr_23 = VEC_MULT(vpr_0, vu_mr[0][1].vx);
            destr_23 = VEC_MADD(vpr_1, vu_mr[1][1].vx, destr_23);
            destr_23 = VEC_MADD(vpr_2, vu_mr[2][1].vx, destr_23);
            destr_23 = VEC_MADD(vpr_3, vu_mr[3][1].vx, destr_23);

            destPvec[0] = VEC_DIV(VEC_MULT(destq_01, destr_01), scaleFactor);
            destPvec[1] = VEC_DIV(VEC_MULT(destq_23, destr_23), scaleFactor);

            destPvec += 2;
            v += 4;
        }
        w += OFFSET*4;
        if (kExtraPatterns) {
            destPvec += kExtraPatterns * 2;
            v += kExtraPatterns * 4;
        }
    }
}


BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartialsAutoScaling(float* destP,
                                                                                     const float* partials_q,
                                                                                     const float* matrices_q,
                                                                                     const float* partials_r,
                                                                                     const float* matrices_r,
                                                                                     int* activateScaling) {
    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcPartialsPartialsAutoScaling(destP,
                                                                                 partials_q,
                                                                                 matrices_q,
                                                                                 partials_r,
                                                                                 matrices_r,
                                                                                 activateScaling);
}

BEAGLE_CPU_4_SSE_TEMPLATE
void BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcPartialsPartialsAutoScaling(double* destP,
                                                                                      const double* partials_q,
                                                                                      const double* matrices_q,
                                                                                      const double* partials_r,
                                                                                      const double* matrices_r,
                                                                                      int* activateScaling) {
    // TODO: implement calcPartialsPartialsAutoScaling with SSE
    BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcPartialsPartialsAutoScaling(destP,
                                                                                  partials_q,
                                                                                  matrices_q,
                                                                                  partials_r,
                                                                                  matrices_r,
                                                                                  activateScaling);
}

BEAGLE_CPU_4_SSE_TEMPLATE
int BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcEdgeLogLikelihoods(const int parIndex,
                                                                           const int childIndex,
                                                                           const int probIndex,
                                                                           const int categoryWeightsIndex,
                                                                           const int stateFrequenciesIndex,
                                                                           const int scalingFactorsIndex,
                                                                           double* outSumLogLikelihood) {
    return BeagleCPU4StateImpl<BEAGLE_CPU_4_SSE_FLOAT>::calcEdgeLogLikelihoods(
        parIndex,
        childIndex,
        probIndex,
        categoryWeightsIndex,
        stateFrequenciesIndex,
        scalingFactorsIndex,
        outSumLogLikelihood);
}

BEAGLE_CPU_4_SSE_TEMPLATE
int BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::calcEdgeLogLikelihoods(const int parIndex,
                                                                            const int childIndex,
                                                                            const int probIndex,
                                                                            const int categoryWeightsIndex,
                                                                            const int stateFrequenciesIndex,
                                                                            const int scalingFactorsIndex,
                                                                            double* outSumLogLikelihood) {
    // TODO: implement derivatives for calculateEdgeLnL

    int returnCode = BEAGLE_SUCCESS;

    assert(parIndex >= kTipCount);

    const double* cl_r = gPartials[parIndex];
    double* cl_p = integrationTmp;
    const double* transMatrix = gTransitionMatrices[probIndex];
    const double* wt = gCategoryWeights[categoryWeightsIndex];
    const double* freqs = gStateFrequencies[stateFrequenciesIndex];

    memset(cl_p, 0, (kPatternCount * kStateCount)*sizeof(double));

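    // Two cases below: if the child is a tip with compact states, integrate the
    // parent partials against the transition probabilities selected by the observed
    // child state; otherwise compute transMatrix * childPartials first. Either way
    // the category-weighted products accumulate into integrationTmp (cl_p), which is
    // then summed against the state frequencies and log-transformed per pattern.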
    if (childIndex < kTipCount && gTipStates[childIndex]) { // Integrate against a state at the child

        const int* statesChild = gTipStates[childIndex];

        int w = 0;
        V_Real *vcl_r = (V_Real *)cl_r;
        for(int l = 0; l < kCategoryCount; l++) {

            VecUnion vu_m[OFFSET][2];
            SSE_PREFETCH_MATRIX(transMatrix + w, vu_m)

            V_Real *vcl_p = (V_Real *)cl_p;

            for(int k = 0; k < kPatternCount; k++) {

                const int stateChild = statesChild[k];
                V_Real vwt = VEC_SPLAT(wt[l]);

                V_Real wtdPartials = VEC_MULT(*vcl_r++, vwt);
                *vcl_p++ = VEC_MADD(vu_m[stateChild][0].vx, wtdPartials, *vcl_p);

                wtdPartials = VEC_MULT(*vcl_r++, vwt);
                *vcl_p++ = VEC_MADD(vu_m[stateChild][1].vx, wtdPartials, *vcl_p);
            }
            w += OFFSET*4;
            vcl_r += 2 * kExtraPatterns;
        }
    } else { // Integrate against a partial at the child

        const double* cl_q = gPartials[childIndex];
        V_Real * vcl_r = (V_Real *)cl_r;
        int v = 0;
        int w = 0;

        for(int l = 0; l < kCategoryCount; l++) {

            V_Real * vcl_p = (V_Real *)cl_p;

            VecUnion vu_m[OFFSET][2];
            SSE_PREFETCH_MATRIX(transMatrix + w, vu_m)

            for(int k = 0; k < kPatternCount; k++) {
                V_Real vclp_01, vclp_23;
                V_Real vwt = VEC_SPLAT(wt[l]);

                V_Real vcl_q0, vcl_q1, vcl_q2, vcl_q3;
                SSE_PREFETCH_PARTIALS(vcl_q,cl_q,v);

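                // vclp_01/vclp_23 = transMatrix * childPartials for this pattern
                // (parent states 0-1 and 2-3), weighted by the category weight and
                // accumulated into the parent-side integration buffer.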
                vclp_01 = VEC_MULT(vcl_q0, vu_m[0][0].vx);
                vclp_01 = VEC_MADD(vcl_q1, vu_m[1][0].vx, vclp_01);
                vclp_01 = VEC_MADD(vcl_q2, vu_m[2][0].vx, vclp_01);
                vclp_01 = VEC_MADD(vcl_q3, vu_m[3][0].vx, vclp_01);
                vclp_23 = VEC_MULT(vcl_q0, vu_m[0][1].vx);
                vclp_23 = VEC_MADD(vcl_q1, vu_m[1][1].vx, vclp_23);
                vclp_23 = VEC_MADD(vcl_q2, vu_m[2][1].vx, vclp_23);
                vclp_23 = VEC_MADD(vcl_q3, vu_m[3][1].vx, vclp_23);
                vclp_01 = VEC_MULT(vclp_01, vwt);
                vclp_23 = VEC_MULT(vclp_23, vwt);

                *vcl_p++ = VEC_MADD(vclp_01, *vcl_r++, *vcl_p);
                *vcl_p++ = VEC_MADD(vclp_23, *vcl_r++, *vcl_p);

                v += 4;
            }
            w += 4*OFFSET;
            if (kExtraPatterns) {
                vcl_r += 2 * kExtraPatterns;
                v += 4 * kExtraPatterns;
            }

        }
    }

    int u = 0;
    for(int k = 0; k < kPatternCount; k++) {
        double sumOverI = 0.0;
        for(int i = 0; i < kStateCount; i++) {
            sumOverI += freqs[i] * cl_p[u];
            u++;
        }

        outLogLikelihoodsTmp[k] = log(sumOverI);
    }


    if (scalingFactorsIndex != BEAGLE_OP_NONE) {
        const double* scalingFactors = gScaleBuffers[scalingFactorsIndex];
        for(int k=0; k < kPatternCount; k++)
            outLogLikelihoodsTmp[k] += scalingFactors[k];
    }

    *outSumLogLikelihood = 0.0;
    for (int i = 0; i < kPatternCount; i++) {
        *outSumLogLikelihood += outLogLikelihoodsTmp[i] * gPatternWeights[i];
    }

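    // (x - x) differs from zero only when x is NaN or infinite, so this flags a
    // floating-point failure in the summed log-likelihood.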
    if (!(*outSumLogLikelihood - *outSumLogLikelihood == 0.0))
        returnCode = BEAGLE_ERROR_FLOATING_POINT;

    return returnCode;
}


BEAGLE_CPU_4_SSE_TEMPLATE
int BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::getPaddedPatternsModulus() {
    return 1;  // We currently do not vectorize across patterns
//  return 4;  // For single-precision, can operate on 4 patterns at a time
    // TODO Vectorize final log operations over patterns
}

BEAGLE_CPU_4_SSE_TEMPLATE
int BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::getPaddedPatternsModulus() {
//  return 2;  // For double-precision, can operate on 2 patterns at a time
    return 1;  // We currently do not vectorize across patterns
}

BEAGLE_CPU_4_SSE_TEMPLATE
const char* BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::getName() {
    return getBeagleCPU4StateSSEName<float>();
}

BEAGLE_CPU_4_SSE_TEMPLATE
const char* BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::getName() {
    return getBeagleCPU4StateSSEName<double>();
}


BEAGLE_CPU_4_SSE_TEMPLATE
const long BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_FLOAT>::getFlags() {
    return BEAGLE_FLAG_COMPUTATION_SYNCH |
           BEAGLE_FLAG_THREADING_NONE |
           BEAGLE_FLAG_PROCESSOR_CPU |
           BEAGLE_FLAG_PRECISION_SINGLE |
           BEAGLE_FLAG_VECTOR_SSE;
}

BEAGLE_CPU_4_SSE_TEMPLATE
const long BeagleCPU4StateSSEImpl<BEAGLE_CPU_4_SSE_DOUBLE>::getFlags() {
    return BEAGLE_FLAG_COMPUTATION_SYNCH |
           BEAGLE_FLAG_THREADING_NONE |
           BEAGLE_FLAG_PROCESSOR_CPU |
           BEAGLE_FLAG_PRECISION_DOUBLE |
           BEAGLE_FLAG_VECTOR_SSE;
}


713 
// BeagleImplFactory public methods

BEAGLE_CPU_FACTORY_TEMPLATE
BeagleImpl* BeagleCPU4StateSSEImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::createImpl(int tipCount,
                                                                                  int partialsBufferCount,
                                                                                  int compactBufferCount,
                                                                                  int stateCount,
                                                                                  int patternCount,
                                                                                  int eigenBufferCount,
                                                                                  int matrixBufferCount,
                                                                                  int categoryCount,
                                                                                  int scaleBufferCount,
                                                                                  int resourceNumber,
                                                                                  long preferenceFlags,
                                                                                  long requirementFlags,
                                                                                  int* errorCode) {

    if (stateCount != 4) {
        return NULL;
    }

    BeagleCPU4StateSSEImpl<REALTYPE, T_PAD_4_SSE_DEFAULT, P_PAD_4_SSE_DEFAULT>* impl =
        new BeagleCPU4StateSSEImpl<REALTYPE, T_PAD_4_SSE_DEFAULT, P_PAD_4_SSE_DEFAULT>();

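    // Refuse to construct the SSE implementation on hosts without SSE support.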
    if (!CPUSupportsSSE()) {
        delete impl;
        return NULL;
    }

    try {
        if (impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount,
                                 patternCount, eigenBufferCount, matrixBufferCount,
                                 categoryCount, scaleBufferCount, resourceNumber, preferenceFlags, requirementFlags) == 0)
            return impl;
    }
    catch(...) {
        if (DEBUGGING_OUTPUT)
            std::cerr << "exception in initialize\n";
        delete impl;
        throw;
    }

    delete impl;

    return NULL;
}

BEAGLE_CPU_FACTORY_TEMPLATE
const char* BeagleCPU4StateSSEImplFactory<BEAGLE_CPU_FACTORY_GENERIC>::getName() {
    return getBeagleCPU4StateSSEName<BEAGLE_CPU_FACTORY_GENERIC>();
}

template <>
const long BeagleCPU4StateSSEImplFactory<double>::getFlags() {
    return BEAGLE_FLAG_COMPUTATION_SYNCH |
           BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO |
           BEAGLE_FLAG_THREADING_NONE |
           BEAGLE_FLAG_PROCESSOR_CPU |
           BEAGLE_FLAG_VECTOR_SSE |
           BEAGLE_FLAG_PRECISION_DOUBLE |
           BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW |
           BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL |
           BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED;
}

template <>
const long BeagleCPU4StateSSEImplFactory<float>::getFlags() {
    return BEAGLE_FLAG_COMPUTATION_SYNCH |
           BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO |
           BEAGLE_FLAG_THREADING_NONE |
           BEAGLE_FLAG_PROCESSOR_CPU |
           BEAGLE_FLAG_VECTOR_SSE |
           BEAGLE_FLAG_PRECISION_SINGLE |
           BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW |
           BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL |
           BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED;
}


}
}

#endif //BEAGLE_CPU_4STATE_SSE_IMPL_HPP