BeagleGPUImpl.hpp
1 
2 /*
3  * BeagleGPUImpl.cpp
4  * BEAGLE
5  *
6  * Copyright 2009 Phylogenetic Likelihood Working Group
7  *
8  * This file is part of BEAGLE.
9  *
10  * BEAGLE is free software: you can redistribute it and/or modify
11  * it under the terms of the GNU Lesser General Public License as
12  * published by the Free Software Foundation, either version 3 of
13  * the License, or (at your option) any later version.
14  *
15  * BEAGLE is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18  * GNU Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with BEAGLE. If not, see
22  * <http://www.gnu.org/licenses/>.
23  *
24  * @author Marc Suchard
25  * @author Andrew Rambaut
26  * @author Daniel Ayres
27  * @author Aaron Darling
28  */
29 #ifdef HAVE_CONFIG_H
30 #include "libhmsbeagle/config.h"
31 #endif
32 
33 #include <cstdio>
34 #include <cstdlib>
35 #include <cassert>
36 #include <iostream>
37 #include <cstring>
38 
39 #include "libhmsbeagle/beagle.h"
40 #include "libhmsbeagle/GPU/GPUImplDefs.h"
41 #include "libhmsbeagle/GPU/BeagleGPUImpl.h"
42 #include "libhmsbeagle/GPU/GPUImplHelper.h"
43 #include "libhmsbeagle/GPU/KernelLauncher.h"
44 #include "libhmsbeagle/GPU/GPUInterface.h"
45 #include "libhmsbeagle/GPU/Precision.h"
46 
47 namespace beagle {
48 namespace gpu {
49 
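// The default constructor below only zeroes every device (GPUPtr) and host
// pointer; no GPU resources are acquired until createInstance(). That keeps
// the destructor safe: guarded by kInitialized, it frees only buffers that
// were actually allocated.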
50 BEAGLE_GPU_TEMPLATE
51 BeagleGPUImpl<BEAGLE_GPU_GENERIC>::BeagleGPUImpl() {
52 
53  gpu = NULL;
54  kernels = NULL;
55 
56  dIntegrationTmp = (GPUPtr)NULL;
57  dOutFirstDeriv = (GPUPtr)NULL;
58  dOutSecondDeriv = (GPUPtr)NULL;
59  dPartialsTmp = (GPUPtr)NULL;
60  dFirstDerivTmp = (GPUPtr)NULL;
61  dSecondDerivTmp = (GPUPtr)NULL;
62 
63  dSumLogLikelihood = (GPUPtr)NULL;
64  dSumFirstDeriv = (GPUPtr)NULL;
65  dSumSecondDeriv = (GPUPtr)NULL;
66 
67  dPatternWeights = (GPUPtr)NULL;
68 
69  dBranchLengths = (GPUPtr)NULL;
70 
71  dDistanceQueue = (GPUPtr)NULL;
72 
73  dPtrQueue = (GPUPtr)NULL;
74 
75  dMaxScalingFactors = (GPUPtr)NULL;
76  dIndexMaxScalingFactors = (GPUPtr)NULL;
77 
78  dEigenValues = NULL;
79  dEvec = NULL;
80  dIevc = NULL;
81 
82  dWeights = NULL;
83  dFrequencies = NULL;
84 
85  dScalingFactors = NULL;
86 
87  dStates = NULL;
88 
89  dPartials = NULL;
90  dMatrices = NULL;
91 
92  dCompactBuffers = NULL;
93  dTipPartialsBuffers = NULL;
94 
95  hPtrQueue = NULL;
96 
97  hCategoryRates = NULL;
98 
99  hPatternWeightsCache = NULL;
100 
101  hDistanceQueue = NULL;
102 
103  hWeightsCache = NULL;
104  hFrequenciesCache = NULL;
105  hLogLikelihoodsCache = NULL;
106  hPartialsCache = NULL;
107  hStatesCache = NULL;
108  hMatrixCache = NULL;
109 
110  hRescalingTrigger = NULL;
111  dRescalingTrigger = (GPUPtr)NULL;
112  dScalingFactorsMaster = NULL;
113 }
114 
115 BEAGLE_GPU_TEMPLATE
116 BeagleGPUImpl<BEAGLE_GPU_GENERIC>::~BeagleGPUImpl() {
117 
118  if (kInitialized) {
119  for (int i=0; i < kEigenDecompCount; i++) {
120  gpu->FreeMemory(dEigenValues[i]);
121  gpu->FreeMemory(dEvec[i]);
122  gpu->FreeMemory(dIevc[i]);
123  gpu->FreeMemory(dWeights[i]);
124  gpu->FreeMemory(dFrequencies[i]);
125  }
126 
127  gpu->FreeMemory(dMatrices[0]);
128 
129  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
130  gpu->FreePinnedHostMemory(hRescalingTrigger);
131  for (int i = 0; i < kScaleBufferCount; i++) {
132  if (dScalingFactorsMaster[i] != 0)
133  gpu->FreeMemory(dScalingFactorsMaster[i]);
134  }
135  free(dScalingFactorsMaster);
136  } else {
137  if (kScaleBufferCount > 0)
138  gpu->FreeMemory(dScalingFactors[0]);
139  }
140 
141  for (int i = 0; i < kBufferCount; i++) {
142  if (i < kTipCount) { // For the tips
143  if (i < kCompactBufferCount)
144  gpu->FreeMemory(dCompactBuffers[i]);
145  if (i < kTipPartialsBufferCount)
146  gpu->FreeMemory(dTipPartialsBuffers[i]);
147  } else {
148  gpu->FreeMemory(dPartials[i]);
149  }
150  }
151 
152  gpu->FreeMemory(dIntegrationTmp);
153  gpu->FreeMemory(dOutFirstDeriv);
154  gpu->FreeMemory(dOutSecondDeriv);
155  gpu->FreeMemory(dPartialsTmp);
156  gpu->FreeMemory(dFirstDerivTmp);
157  gpu->FreeMemory(dSecondDerivTmp);
158 
159  gpu->FreeMemory(dSumLogLikelihood);
160  gpu->FreeMemory(dSumFirstDeriv);
161  gpu->FreeMemory(dSumSecondDeriv);
162 
163  gpu->FreeMemory(dPatternWeights);
164 
165  gpu->FreeMemory(dBranchLengths);
166 
167  gpu->FreeMemory(dDistanceQueue);
168 
169  gpu->FreeMemory(dPtrQueue);
170 
171  gpu->FreeMemory(dMaxScalingFactors);
172  gpu->FreeMemory(dIndexMaxScalingFactors);
173 
174  if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
175  gpu->FreeMemory(dAccumulatedScalingFactors);
176  }
177 
178  free(dEigenValues);
179  free(dEvec);
180  free(dIevc);
181 
182  free(dWeights);
183  free(dFrequencies);
184 
185  free(dScalingFactors);
186 
187  free(dStates);
188 
189  free(dPartials);
190  free(dMatrices);
191 
192  free(dCompactBuffers);
193  free(dTipPartialsBuffers);
194 
195  gpu->FreeHostMemory(hPtrQueue);
196 
197  gpu->FreeHostMemory(hCategoryRates);
198 
199  gpu->FreeHostMemory(hPatternWeightsCache);
200 
201  gpu->FreeHostMemory(hDistanceQueue);
202 
203  gpu->FreeHostMemory(hWeightsCache);
204  gpu->FreeHostMemory(hFrequenciesCache);
205  gpu->FreeHostMemory(hPartialsCache);
206  gpu->FreeHostMemory(hStatesCache);
207 
208  gpu->FreeHostMemory(hLogLikelihoodsCache);
209  gpu->FreeHostMemory(hMatrixCache);
210 
211  }
212 
213  if (kernels)
214  delete kernels;
215  if (gpu)
216  delete gpu;
217 
218 }
219 
220 BEAGLE_GPU_TEMPLATE
221 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::createInstance(int tipCount,
222  int partialsBufferCount,
223  int compactBufferCount,
224  int stateCount,
225  int patternCount,
226  int eigenDecompositionCount,
227  int matrixCount,
228  int categoryCount,
229  int scaleBufferCount,
230  int iResourceNumber,
231  long preferenceFlags,
232  long requirementFlags) {
233 
234 #ifdef BEAGLE_DEBUG_FLOW
235  fprintf(stderr, "\tEntering BeagleGPUImpl::createInstance\n");
236 #endif
237 
238  kInitialized = 0;
239 
240  kTipCount = tipCount;
241  kPartialsBufferCount = partialsBufferCount;
242  kCompactBufferCount = compactBufferCount;
243  kStateCount = stateCount;
244  kPatternCount = patternCount;
245  kEigenDecompCount = eigenDecompositionCount;
246  kMatrixCount = matrixCount;
247  kCategoryCount = categoryCount;
248  kScaleBufferCount = scaleBufferCount;
249 
250  resourceNumber = iResourceNumber;
251 
252  kTipPartialsBufferCount = kTipCount - kCompactBufferCount;
253  kBufferCount = kPartialsBufferCount + kCompactBufferCount;
254 
255  kInternalPartialsBufferCount = kBufferCount - kTipCount;
256 
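// kStateCount is padded up to the next kernel-supported size (4, 16, 32, 48,
// 64, 80, 128 or 192; anything larger falls through to
// kStateCount + kStateCount % 16), presumably so that per-state work maps
// cleanly onto the GPU block sizes. For example, a 61-state codon model is
// padded to 64.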
257  if (kStateCount <= 4)
258  kPaddedStateCount = 4;
259  else if (kStateCount <= 16)
260  kPaddedStateCount = 16;
261  else if (kStateCount <= 32)
262  kPaddedStateCount = 32;
263  else if (kStateCount <= 48)
264  kPaddedStateCount = 48;
265  else if (kStateCount <= 64)
266  kPaddedStateCount = 64;
267  else if (kStateCount <= 80)
268  kPaddedStateCount = 80;
269  else if (kStateCount <= 128)
270  kPaddedStateCount = 128;
271  else if (kStateCount <= 192)
272  kPaddedStateCount = 192;
273  else
274  kPaddedStateCount = kStateCount + kStateCount % 16;
275 
276  // Make sure that kPaddedPatternCount (= kPatternCount + paddedPatterns) is a multiple of 4 for the DNA model
277  int paddedPatterns = 0;
278  if (kPaddedStateCount == 4 && kPatternCount % 4 != 0)
279  paddedPatterns = 4 - kPatternCount % 4;
280 
281  // TODO Should do something similar for 4 < kStateCount <= 8 as well
282 
283  kPaddedPatternCount = kPatternCount + paddedPatterns;
284 
285  int resultPaddedPatterns = 0;
286  int patternBlockSizeFour = (kFlags & BEAGLE_FLAG_PRECISION_DOUBLE ? PATTERN_BLOCK_SIZE_DP_4 : PATTERN_BLOCK_SIZE_SP_4);
287  if (kPaddedStateCount == 4 && kPaddedPatternCount % patternBlockSizeFour != 0)
288  resultPaddedPatterns = patternBlockSizeFour - kPaddedPatternCount % patternBlockSizeFour;
289 
290  kScaleBufferSize = kPaddedPatternCount;
291 
292  kFlags = 0;
293 
294  if (preferenceFlags & BEAGLE_FLAG_SCALING_AUTO || requirementFlags & BEAGLE_FLAG_SCALING_AUTO) {
295  kFlags |= BEAGLE_FLAG_SCALING_AUTO;
296  kFlags |= BEAGLE_FLAG_SCALERS_LOG;
297  kScaleBufferCount = kInternalPartialsBufferCount;
298  kScaleBufferSize *= kCategoryCount;
299  } else if (preferenceFlags & BEAGLE_FLAG_SCALING_ALWAYS || requirementFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
300  kFlags |= BEAGLE_FLAG_SCALING_ALWAYS;
301  kFlags |= BEAGLE_FLAG_SCALERS_LOG;
302  kScaleBufferCount = kInternalPartialsBufferCount + 1; // +1 for temp buffer used by edge-likelihood calculation
303  } else if (preferenceFlags & BEAGLE_FLAG_SCALING_DYNAMIC || requirementFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
304  kFlags |= BEAGLE_FLAG_SCALING_DYNAMIC;
305  kFlags |= BEAGLE_FLAG_SCALERS_RAW;
306  } else if (preferenceFlags & BEAGLE_FLAG_SCALERS_LOG || requirementFlags & BEAGLE_FLAG_SCALERS_LOG) {
307  kFlags |= BEAGLE_FLAG_SCALING_MANUAL;
308  kFlags |= BEAGLE_FLAG_SCALERS_LOG;
309  } else {
310  kFlags |= BEAGLE_FLAG_SCALING_MANUAL;
311  kFlags |= BEAGLE_FLAG_SCALERS_RAW;
312  }
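// The chain above resolves the scaling mode by precedence: AUTO, then ALWAYS,
// then DYNAMIC, then manual scaling with log scalers, and finally the default
// of manual scaling with raw scalers. AUTO and ALWAYS also force log-space
// scalers and adjust kScaleBufferCount (and, for AUTO, kScaleBufferSize).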
313 
314  if (preferenceFlags & BEAGLE_FLAG_EIGEN_COMPLEX || requirementFlags & BEAGLE_FLAG_EIGEN_COMPLEX) {
315  kFlags |= BEAGLE_FLAG_EIGEN_COMPLEX;
316  } else {
317  kFlags |= BEAGLE_FLAG_EIGEN_REAL;
318  }
319 
320  if (requirementFlags & BEAGLE_FLAG_INVEVEC_TRANSPOSED || preferenceFlags & BEAGLE_FLAG_INVEVEC_TRANSPOSED)
321  kFlags |= BEAGLE_FLAG_INVEVEC_TRANSPOSED;
322  else
323  kFlags |= BEAGLE_FLAG_INVEVEC_STANDARD;
324 
325  Real r = 0;
326  modifyFlagsForPrecision(&kFlags, r);
327 
328  int sumSitesBlockSize = (kFlags & BEAGLE_FLAG_PRECISION_DOUBLE ? SUM_SITES_BLOCK_SIZE_DP : SUM_SITES_BLOCK_SIZE_SP);
329  kSumSitesBlockCount = kPatternCount / sumSitesBlockSize;
330  if (kPatternCount % sumSitesBlockSize != 0)
331  kSumSitesBlockCount += 1;
332 
333  kPartialsSize = kPaddedPatternCount * kPaddedStateCount * kCategoryCount;
334  kMatrixSize = kPaddedStateCount * kPaddedStateCount;
335 
336  if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX)
337  kEigenValuesSize = 2 * kPaddedStateCount;
338  else
339  kEigenValuesSize = kPaddedStateCount;
340 
341  kLastCompactBufferIndex = -1;
342  kLastTipPartialsBufferIndex = -1;
343 
344  gpu = new GPUInterface();
345 
346  gpu->Initialize();
347 
348  int numDevices = 0;
349  numDevices = gpu->GetDeviceCount();
350  if (numDevices == 0) {
351  fprintf(stderr, "Error: No GPU devices\n");
352  return BEAGLE_ERROR_NO_RESOURCE;
353  }
354  if (resourceNumber > numDevices) {
355  fprintf(stderr,"Error: Trying to initialize device # %d (which does not exist)\n",resourceNumber);
356  return BEAGLE_ERROR_NO_RESOURCE;
357  }
358 
359  // TODO: recompiling kernels for every instance, probably not ideal
360  gpu->SetDevice(resourceNumber-1,kPaddedStateCount,kCategoryCount,kPaddedPatternCount,kFlags);
361 
362  int ptrQueueLength = kMatrixCount * kCategoryCount * 3;
363  if (kPartialsBufferCount > ptrQueueLength)
364  ptrQueueLength = kPartialsBufferCount;
365 
366  unsigned int neededMemory = sizeof(Real) * (kMatrixSize * kEigenDecompCount + // dEvec
367  kMatrixSize * kEigenDecompCount + // dIevc
368  kEigenValuesSize * kEigenDecompCount + // dEigenValues
369  kCategoryCount * kPartialsBufferCount + // dWeights
370  kPaddedStateCount * kPartialsBufferCount + // dFrequencies
371  kPaddedPatternCount + // dIntegrationTmp
372  kPaddedPatternCount + // dOutFirstDeriv
373  kPaddedPatternCount + // dOutSecondDeriv
374  kPartialsSize + // dPartialsTmp
375  kPartialsSize + // dFirstDerivTmp
376  kPartialsSize + // dSecondDerivTmp
377  kScaleBufferCount * kPaddedPatternCount + // dScalingFactors
378  kPartialsBufferCount * kPartialsSize + // dTipPartialsBuffers + dPartials
379  kMatrixCount * kMatrixSize * kCategoryCount + // dMatrices
380  kBufferCount + // dBranchLengths
381  kMatrixCount * kCategoryCount * 2) + // dDistanceQueue
382  sizeof(int) * kCompactBufferCount * kPaddedPatternCount + // dCompactBuffers
383  sizeof(GPUPtr) * ptrQueueLength; // dPtrQueue
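// neededMemory is a coarse pre-allocation estimate (element counts times
// sizeof(Real), plus the compact-state and pointer queues); it is checked
// against the device's reported free memory below so the call can fail fast
// with BEAGLE_ERROR_OUT_OF_MEMORY rather than partway through allocation.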
384 
385 
386  unsigned int availableMem = gpu->GetAvailableMemory();
387 
388 #ifdef BEAGLE_DEBUG_VALUES
389  fprintf(stderr, " needed memory: %d\n", neededMemory);
390  fprintf(stderr, " available memory: %d\n", availableMem);
391 #endif
392 
393  if (availableMem < neededMemory)
394  return BEAGLE_ERROR_OUT_OF_MEMORY;
395 
396  kernels = new KernelLauncher(gpu);
397 
398  // TODO: only allocate if necessary on the fly
399  hWeightsCache = (Real*) gpu->CallocHost(kCategoryCount * kPartialsBufferCount, sizeof(Real));
400  hFrequenciesCache = (Real*) gpu->CallocHost(kPaddedStateCount * kPartialsBufferCount, sizeof(Real));
401  hPartialsCache = (Real*) gpu->CallocHost(kPartialsSize, sizeof(Real));
402  hStatesCache = (int*) gpu->CallocHost(kPaddedPatternCount, sizeof(int));
403 
404  int hMatrixCacheSize = kMatrixSize * kCategoryCount * BEAGLE_CACHED_MATRICES_COUNT;
405  if ((2 * kMatrixSize + kEigenValuesSize) > hMatrixCacheSize)
406  hMatrixCacheSize = 2 * kMatrixSize + kEigenValuesSize;
407 
408  hLogLikelihoodsCache = (Real*) gpu->MallocHost(kPatternCount * sizeof(Real));
409  hMatrixCache = (Real*) gpu->CallocHost(hMatrixCacheSize, sizeof(Real));
410 
411  dEvec = (GPUPtr*) calloc(sizeof(GPUPtr),kEigenDecompCount);
412  dIevc = (GPUPtr*) calloc(sizeof(GPUPtr),kEigenDecompCount);
413  dEigenValues = (GPUPtr*) calloc(sizeof(GPUPtr),kEigenDecompCount);
414  dWeights = (GPUPtr*) calloc(sizeof(GPUPtr),kEigenDecompCount);
415  dFrequencies = (GPUPtr*) calloc(sizeof(GPUPtr),kEigenDecompCount);
416 
417  dMatrices = (GPUPtr*) malloc(sizeof(GPUPtr) * kMatrixCount);
418  dMatrices[0] = gpu->AllocateMemory(kMatrixCount * kMatrixSize * kCategoryCount * sizeof(Real));
419 
420  for (int i = 1; i < kMatrixCount; i++) {
421  dMatrices[i] = dMatrices[i-1] + kMatrixSize * kCategoryCount * sizeof(Real);
422  }
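// All transition matrices share one contiguous device allocation; dMatrices[i]
// is the base pointer plus i * kMatrixSize * kCategoryCount * sizeof(Real).
// As an illustration (not from the source): with kPaddedStateCount = 4,
// kCategoryCount = 4 and single precision, each matrix set occupies
// 16 * 4 * 4 = 256 bytes.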
423 
424  if (kScaleBufferCount > 0) {
425  if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
426  dScalingFactors = (GPUPtr*) malloc(sizeof(GPUPtr) * kScaleBufferCount);
427  dScalingFactors[0] = gpu->AllocateMemory(sizeof(signed char) * kScaleBufferSize * kScaleBufferCount); // TODO: char won't work for double-precision
428  for (int i=1; i < kScaleBufferCount; i++)
429  dScalingFactors[i] = dScalingFactors[i-1] + kScaleBufferSize * sizeof(signed char);
430  } else if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
431  dScalingFactors = (GPUPtr*) calloc(sizeof(GPUPtr), kScaleBufferCount);
432  dScalingFactorsMaster = (GPUPtr*) calloc(sizeof(GPUPtr), kScaleBufferCount);
433  hRescalingTrigger = (int*) gpu->AllocatePinnedHostMemory(sizeof(int), false, true);
434  dRescalingTrigger = gpu->GetDevicePointer((void*) hRescalingTrigger);
435  } else {
436  dScalingFactors = (GPUPtr*) malloc(sizeof(GPUPtr) * kScaleBufferCount);
437  dScalingFactors[0] = gpu->AllocateMemory(kScaleBufferSize * kScaleBufferCount * sizeof(Real));
438  for (int i=1; i < kScaleBufferCount; i++) {
439  dScalingFactors[i] = dScalingFactors[i-1] + kScaleBufferSize * sizeof(Real);
440  }
441  }
442  }
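// Three scale-buffer layouts are set up above: auto-scaling packs per-pattern
// exponents as signed char into one block; dynamic scaling allocates buffers
// lazily, tracking them in dScalingFactorsMaster and signalling rescales
// through the pinned hRescalingTrigger; otherwise one Real block is carved
// into kScaleBufferCount sub-buffers.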
443 
444  for(int i=0; i<kEigenDecompCount; i++) {
445  dEvec[i] = gpu->AllocateMemory(kMatrixSize * sizeof(Real));
446  dIevc[i] = gpu->AllocateMemory(kMatrixSize * sizeof(Real));
447  dEigenValues[i] = gpu->AllocateMemory(kEigenValuesSize * sizeof(Real));
448  dWeights[i] = gpu->AllocateMemory(kCategoryCount * sizeof(Real));
449  dFrequencies[i] = gpu->AllocateMemory(kPaddedStateCount * sizeof(Real));
450  }
451 
452 
453  dIntegrationTmp = gpu->AllocateMemory((kPaddedPatternCount + resultPaddedPatterns) * sizeof(Real));
454  dOutFirstDeriv = gpu->AllocateMemory((kPaddedPatternCount + resultPaddedPatterns) * sizeof(Real));
455  dOutSecondDeriv = gpu->AllocateMemory((kPaddedPatternCount + resultPaddedPatterns) * sizeof(Real));
456 
457  dPatternWeights = gpu->AllocateMemory(kPatternCount * sizeof(Real));
458 
459  dSumLogLikelihood = gpu->AllocateMemory(kSumSitesBlockCount * sizeof(Real));
460  dSumFirstDeriv = gpu->AllocateMemory(kSumSitesBlockCount * sizeof(Real));
461  dSumSecondDeriv = gpu->AllocateMemory(kSumSitesBlockCount * sizeof(Real));
462 
463  dPartialsTmp = gpu->AllocateMemory(kPartialsSize * sizeof(Real));
464  dFirstDerivTmp = gpu->AllocateMemory(kPartialsSize * sizeof(Real));
465  dSecondDerivTmp = gpu->AllocateMemory(kPartialsSize * sizeof(Real));
466 
467  // Fill with 0s so 'free' does not choke if unallocated
468  dPartials = (GPUPtr*) calloc(sizeof(GPUPtr), kBufferCount);
469 
470  // Internal nodes stay 0 here so that partials, rather than compact states, are used for them
471  dStates = (GPUPtr*) calloc(sizeof(GPUPtr), kBufferCount);
472 
473  dCompactBuffers = (GPUPtr*) malloc(sizeof(GPUPtr) * kCompactBufferCount);
474  dTipPartialsBuffers = (GPUPtr*) malloc(sizeof(GPUPtr) * kTipPartialsBufferCount);
475 
476  for (int i = 0; i < kBufferCount; i++) {
477  if (i < kTipCount) { // For the tips
478  if (i < kCompactBufferCount)
479  dCompactBuffers[i] = gpu->AllocateMemory(kPaddedPatternCount * sizeof(int));
480  if (i < kTipPartialsBufferCount)
481  dTipPartialsBuffers[i] = gpu->AllocateMemory(kPartialsSize * sizeof(Real));
482  } else {
483  dPartials[i] = gpu->AllocateMemory(kPartialsSize * sizeof(Real));
484  }
485  }
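// Tip buffers are allocated here but not yet bound to tip indices; they are
// handed out later by setTipStates()/setTipPartials(), which pop them via
// kLastCompactBufferIndex and kLastTipPartialsBufferIndex.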
486 
487  kLastCompactBufferIndex = kCompactBufferCount - 1;
488  kLastTipPartialsBufferIndex = kTipPartialsBufferCount - 1;
489 
490  // No operation uses more than kBufferCount branch lengths
491  dBranchLengths = gpu->AllocateMemory(kBufferCount * sizeof(Real));
492 
493  dDistanceQueue = gpu->AllocateMemory(kMatrixCount * kCategoryCount * 2 * sizeof(Real));
494  hDistanceQueue = (Real*) gpu->MallocHost(sizeof(Real) * kMatrixCount * kCategoryCount * 2);
495  checkHostMemory(hDistanceQueue);
496 
497  dPtrQueue = gpu->AllocateMemory(sizeof(unsigned int) * ptrQueueLength);
498  hPtrQueue = (unsigned int*) gpu->MallocHost(sizeof(unsigned int) * ptrQueueLength);
499  checkHostMemory(hPtrQueue);
500 
501  hCategoryRates = (double*) gpu->MallocHost(sizeof(double) * kCategoryCount); // Keep in double-precision
502  checkHostMemory(hCategoryRates);
503 
504  hPatternWeightsCache = (Real*) gpu->MallocHost(sizeof(double) * kPatternCount);
505  checkHostMemory(hPatternWeightsCache);
506 
507  dMaxScalingFactors = gpu->AllocateMemory(kPaddedPatternCount * sizeof(Real));
508  dIndexMaxScalingFactors = gpu->AllocateMemory(kPaddedPatternCount * sizeof(int));
509 
510  if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
511  dAccumulatedScalingFactors = gpu->AllocateMemory(sizeof(int) * kScaleBufferSize);
512  }
513 
514 #ifdef BEAGLE_DEBUG_FLOW
515  fprintf(stderr, "\tLeaving BeagleGPUImpl::createInstance\n");
516 #endif
517 
518  kInitialized = 1;
519 
520 #ifdef BEAGLE_DEBUG_VALUES
521  gpu->Synchronize();
522  int usedMemory = availableMem - gpu->GetAvailableMemory();
523  fprintf(stderr, "actual used memory: %d\n", usedMemory);
524  fprintf(stderr, " difference: %d\n\n", usedMemory-neededMemory);
525 #endif
526 
527  return BEAGLE_SUCCESS;
528 }
529 
530 template<>
531 char* BeagleGPUImpl<double>::getInstanceName() {
532  return (char*) "CUDA-Double";
533 }
534 
535 template<>
536 char* BeagleGPUImpl<float>::getInstanceName() {
537  return (char*) "CUDA-Single";
538 }
539 
540 BEAGLE_GPU_TEMPLATE
541 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::getInstanceDetails(BeagleInstanceDetails* returnInfo) {
542  if (returnInfo != NULL) {
543  returnInfo->resourceNumber = resourceNumber;
544  returnInfo->flags = BEAGLE_FLAG_COMPUTATION_SYNCH |
545  BEAGLE_FLAG_THREADING_NONE |
546  BEAGLE_FLAG_VECTOR_NONE |
547  BEAGLE_FLAG_PROCESSOR_GPU;
548  Real r = 0;
549  modifyFlagsForPrecision(&(returnInfo->flags), r);
550 
551  returnInfo->flags |= kFlags;
552 
553  returnInfo->implName = getInstanceName();
554  }
555  return BEAGLE_SUCCESS;
556 }
557 
558 BEAGLE_GPU_TEMPLATE
559 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTipStates(int tipIndex,
560  const int* inStates) {
561 
562 #ifdef BEAGLE_DEBUG_FLOW
563  fprintf(stderr, "\tEntering BeagleGPUImpl::setTipStates\n");
564 #endif
565 
566  if (tipIndex < 0 || tipIndex >= kTipCount)
567  return BEAGLE_ERROR_OUT_OF_RANGE;
568 
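// Observed states below kStateCount are stored as-is; anything else
// (ambiguity codes, gaps) is collapsed to kPaddedStateCount, an out-of-range
// sentinel for the state kernels.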
569  for(int i = 0; i < kPatternCount; i++)
570  hStatesCache[i] = (inStates[i] < kStateCount ? inStates[i] : kPaddedStateCount);
571 
572  // Padded extra patterns
573  for(int i = kPatternCount; i < kPaddedPatternCount; i++)
574  hStatesCache[i] = kPaddedStateCount;
575 
576  if (dStates[tipIndex] == 0) {
577  assert(kLastCompactBufferIndex >= 0 && kLastCompactBufferIndex < kCompactBufferCount);
578  dStates[tipIndex] = dCompactBuffers[kLastCompactBufferIndex--];
579  }
580  // Copy to GPU device
581  gpu->MemcpyHostToDevice(dStates[tipIndex], hStatesCache, sizeof(int) * kPaddedPatternCount);
582 
583 #ifdef BEAGLE_DEBUG_FLOW
584  fprintf(stderr, "\tLeaving BeagleGPUImpl::setTipStates\n");
585 #endif
586 
587  return BEAGLE_SUCCESS;
588 }
589 
590 BEAGLE_GPU_TEMPLATE
591 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTipPartials(int tipIndex,
592  const double* inPartials) {
593 #ifdef BEAGLE_DEBUG_FLOW
594  fprintf(stderr, "\tEntering BeagleGPUImpl::setTipPartials\n");
595 #endif
596 
597  if (tipIndex < 0 || tipIndex >= kTipCount)
598  return BEAGLE_ERROR_OUT_OF_RANGE;
599 
600  const double* inPartialsOffset = inPartials;
601  Real* tmpRealPartialsOffset = hPartialsCache;
602  for (int i = 0; i < kPatternCount; i++) {
603 //#ifdef DOUBLE_PRECISION
604 // memcpy(tmpRealPartialsOffset, inPartialsOffset, sizeof(Real) * kStateCount);
605 //#else
606 // MEMCNV(tmpRealPartialsOffset, inPartialsOffset, kStateCount, Real);
607 //#endif
608  beagleMemCpy(tmpRealPartialsOffset, inPartialsOffset, kStateCount);
609  tmpRealPartialsOffset += kPaddedStateCount;
610  inPartialsOffset += kStateCount;
611  }
612 
613  int partialsLength = kPaddedPatternCount * kPaddedStateCount;
614  for (int i = 1; i < kCategoryCount; i++) {
615  memcpy(hPartialsCache + i * partialsLength, hPartialsCache, partialsLength * sizeof(Real));
616  }
617 
618  if (tipIndex < kTipCount) {
619  if (dPartials[tipIndex] == 0) {
620  assert(kLastTipPartialsBufferIndex >= 0 && kLastTipPartialsBufferIndex <
621  kTipPartialsBufferCount);
622  dPartials[tipIndex] = dTipPartialsBuffers[kLastTipPartialsBufferIndex--];
623  }
624  }
625  // Copy to GPU device
626  gpu->MemcpyHostToDevice(dPartials[tipIndex], hPartialsCache, sizeof(Real) * kPartialsSize);
627 
628 #ifdef BEAGLE_DEBUG_FLOW
629  fprintf(stderr, "\tLeaving BeagleGPUImpl::setTipPartials\n");
630 #endif
631 
632  return BEAGLE_SUCCESS;
633 }
634 
635 BEAGLE_GPU_TEMPLATE
636 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setPartials(int bufferIndex,
637  const double* inPartials) {
638 #ifdef BEAGLE_DEBUG_FLOW
639  fprintf(stderr, "\tEntering BeagleGPUImpl::setPartials\n");
640 #endif
641 
642  if (bufferIndex < 0 || bufferIndex >= kPartialsBufferCount)
643  return BEAGLE_ERROR_OUT_OF_RANGE;
644 
645  const double* inPartialsOffset = inPartials;
646  Real* tmpRealPartialsOffset = hPartialsCache;
647  for (int l = 0; l < kCategoryCount; l++) {
648  for (int i = 0; i < kPatternCount; i++) {
649 //#ifdef DOUBLE_PRECISION
650 // memcpy(tmpRealPartialsOffset, inPartialsOffset, sizeof(Real) * kStateCount);
651 //#else
652 // MEMCNV(tmpRealPartialsOffset, inPartialsOffset, kStateCount, Real);
653 //#endif
654  beagleMemCpy(tmpRealPartialsOffset, inPartialsOffset, kStateCount);
655  tmpRealPartialsOffset += kPaddedStateCount;
656  inPartialsOffset += kStateCount;
657  }
658  tmpRealPartialsOffset += kPaddedStateCount * (kPaddedPatternCount - kPatternCount);
659  }
660 
661  if (bufferIndex < kTipCount) {
662  if (dPartials[bufferIndex] == 0) {
663  assert(kLastTipPartialsBufferIndex >= 0 && kLastTipPartialsBufferIndex <
664  kTipPartialsBufferCount);
665  dPartials[bufferIndex] = dTipPartialsBuffers[kLastTipPartialsBufferIndex--];
666  }
667  }
668  // Copy to GPU device
669  gpu->MemcpyHostToDevice(dPartials[bufferIndex], hPartialsCache, sizeof(Real) * kPartialsSize);
670 
671 #ifdef BEAGLE_DEBUG_FLOW
672  fprintf(stderr, "\tLeaving BeagleGPUImpl::setPartials\n");
673 #endif
674 
675  return BEAGLE_SUCCESS;
676 }
677 
678 BEAGLE_GPU_TEMPLATE
679 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::getPartials(int bufferIndex,
680  int scaleIndex,
681  double* outPartials) {
682 #ifdef BEAGLE_DEBUG_FLOW
683  fprintf(stderr, "\tEntering BeagleGPUImpl::getPartials\n");
684 #endif
685 
686  gpu->MemcpyDeviceToHost(hPartialsCache, dPartials[bufferIndex], sizeof(Real) * kPartialsSize);
687 
688  double* outPartialsOffset = outPartials;
689  Real* tmpRealPartialsOffset = hPartialsCache;
690 
691  for (int i = 0; i < kPatternCount; i++) {
692 //#ifdef DOUBLE_PRECISION
693 // memcpy(outPartialsOffset, tmpRealPartialsOffset, sizeof(Real) * kStateCount);
694 //#else
695 // MEMCNV(outPartialsOffset, tmpRealPartialsOffset, kStateCount, double);
696 //#endif
697  beagleMemCpy(outPartialsOffset, tmpRealPartialsOffset, kStateCount);
698  tmpRealPartialsOffset += kPaddedStateCount;
699  outPartialsOffset += kStateCount;
700  }
701 
702 #ifdef BEAGLE_DEBUG_FLOW
703  fprintf(stderr, "\tLeaving BeagleGPUImpl::getPartials\n");
704 #endif
705 
706  return BEAGLE_SUCCESS;
707 }
708 
709 BEAGLE_GPU_TEMPLATE
710 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setEigenDecomposition(int eigenIndex,
711  const double* inEigenVectors,
712  const double* inInverseEigenVectors,
713  const double* inEigenValues) {
714 
715 #ifdef BEAGLE_DEBUG_FLOW
716  fprintf(stderr,"\tEntering BeagleGPUImpl::setEigenDecomposition\n");
717 #endif
718 
719  // Native memory packing order (length): Ievc (state^2), Evec (state^2),
720  // Eval (state), EvalImag (state)
721 
722  Real* Ievc, * tmpIevc, * Evec, * tmpEvec, * Eval;
723 
724  tmpIevc = Ievc = (Real*) hMatrixCache;
725  tmpEvec = Evec = Ievc + kMatrixSize;
726  Eval = Evec + kMatrixSize;
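// Rows below are copied kStateCount entries at a time but advanced by
// kPaddedStateCount, so the padding columns keep whatever hMatrixCache held
// (zeros on first use, since it was allocated with CallocHost).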
727 
728  for (int i = 0; i < kStateCount; i++) {
729 //#ifdef DOUBLE_PRECISION
730 // memcpy(tmpIevc, inInverseEigenVectors + i * kStateCount, sizeof(Real) * kStateCount);
731 // memcpy(tmpEvec, inEigenVectors + i * kStateCount, sizeof(Real) * kStateCount);
732 //#else
733 // MEMCNV(tmpIevc, (inInverseEigenVectors + i * kStateCount), kStateCount, Real);
734 // MEMCNV(tmpEvec, (inEigenVectors + i * kStateCount), kStateCount, Real);
735 //#endif
736  beagleMemCpy(tmpIevc, inInverseEigenVectors + i * kStateCount, kStateCount);
737  beagleMemCpy(tmpEvec, inEigenVectors + i * kStateCount, kStateCount);
738  tmpIevc += kPaddedStateCount;
739  tmpEvec += kPaddedStateCount;
740  }
741 
742  // Transposing matrices avoids incoherent (non-coalesced) memory reads/writes
743  // TODO: Only need to transpose the sub-matrix of the true state count
744  if (kFlags & BEAGLE_FLAG_INVEVEC_STANDARD)
745  transposeSquareMatrix(Ievc, kPaddedStateCount);
746  transposeSquareMatrix(Evec, kPaddedStateCount);
747 
748 //#ifdef DOUBLE_PRECISION
749 // memcpy(Eval, inEigenValues, sizeof(Real) * kStateCount);
750 // if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX)
751 // memcpy(Eval+kPaddedStateCount,inEigenValues+kStateCount,sizeof(Real)*kStateCount);
752 //#else
753 // MEMCNV(Eval, inEigenValues, kStateCount, Real);
754 // if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX)
755 // MEMCNV((Eval+kPaddedStateCount),(inEigenValues+kStateCount),kStateCount,Real);
756 //#endif
757  beagleMemCpy(Eval, inEigenValues, kStateCount);
758  if (kFlags & BEAGLE_FLAG_EIGEN_COMPLEX) {
759  beagleMemCpy(Eval + kPaddedStateCount, inEigenValues + kStateCount, kStateCount);
760  }
761 
762 #ifdef BEAGLE_DEBUG_VALUES
763 //#ifdef DOUBLE_PRECISION
764 // fprintf(stderr, "Eval:\n");
765 // printfVectorD(Eval, kEigenValuesSize);
766 // fprintf(stderr, "Evec:\n");
767 // printfVectorD(Evec, kMatrixSize);
768 // fprintf(stderr, "Ievc:\n");
769 // printfVectorD(Ievc, kPaddedStateCount * kPaddedStateCount);
770 //#else
771  fprintf(stderr, "Eval =\n");
772  printfVector(Eval, kEigenValuesSize);
773  fprintf(stderr, "Evec =\n");
774  printfVector(Evec, kMatrixSize);
775  fprintf(stderr, "Ievc =\n");
776  printfVector(Ievc, kPaddedStateCount * kPaddedStateCount);
777 //#endif
778 #endif
779 
780  // Copy to GPU device
781  gpu->MemcpyHostToDevice(dIevc[eigenIndex], Ievc, sizeof(Real) * kMatrixSize);
782  gpu->MemcpyHostToDevice(dEvec[eigenIndex], Evec, sizeof(Real) * kMatrixSize);
783  gpu->MemcpyHostToDevice(dEigenValues[eigenIndex], Eval, sizeof(Real) * kEigenValuesSize);
784 
785 #ifdef BEAGLE_DEBUG_VALUES
786  Real r = 0;
787  fprintf(stderr, "dEigenValues =\n");
788  gpu->PrintfDeviceVector(dEigenValues[eigenIndex], kEigenValuesSize, r);
789  fprintf(stderr, "dEvec =\n");
790  gpu->PrintfDeviceVector(dEvec[eigenIndex], kMatrixSize, r);
791  fprintf(stderr, "dIevc =\n");
792  gpu->PrintfDeviceVector(dIevc[eigenIndex], kPaddedStateCount * kPaddedStateCount, r);
793 #endif
794 
795 #ifdef BEAGLE_DEBUG_FLOW
796  fprintf(stderr, "\tLeaving BeagleGPUImpl::setEigenDecomposition\n");
797 #endif
798 
799  return BEAGLE_SUCCESS;
800 }
801 
802 BEAGLE_GPU_TEMPLATE
803 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setStateFrequencies(int stateFrequenciesIndex,
804  const double* inStateFrequencies) {
805 #ifdef BEAGLE_DEBUG_FLOW
806  fprintf(stderr, "\tEntering BeagleGPUImpl::setStateFrequencies\n");
807 #endif
808 
809  if (stateFrequenciesIndex < 0 || stateFrequenciesIndex >= kEigenDecompCount)
810  return BEAGLE_ERROR_OUT_OF_RANGE;
811 
812 //#ifdef DOUBLE_PRECISION
813 // memcpy(hFrequenciesCache, inStateFrequencies, kStateCount * sizeof(Real));
814 //#else
815 // MEMCNV(hFrequenciesCache, inStateFrequencies, kStateCount, Real);
816 //#endif
817  beagleMemCpy(hFrequenciesCache, inStateFrequencies, kStateCount);
818 
819  gpu->MemcpyHostToDevice(dFrequencies[stateFrequenciesIndex], hFrequenciesCache,
820  sizeof(Real) * kPaddedStateCount);
821 
822 #ifdef BEAGLE_DEBUG_FLOW
823  fprintf(stderr, "\tLeaving BeagleGPUImpl::setStateFrequencies\n");
824 #endif
825 
826  return BEAGLE_SUCCESS;
827 }
828 
829 BEAGLE_GPU_TEMPLATE
830 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setCategoryWeights(int categoryWeightsIndex,
831  const double* inCategoryWeights) {
832 #ifdef BEAGLE_DEBUG_FLOW
833  fprintf(stderr, "\tEntering BeagleGPUImpl::setCategoryWeights\n");
834 #endif
835 
836  if (categoryWeightsIndex < 0 || categoryWeightsIndex >= kEigenDecompCount)
837  return BEAGLE_ERROR_OUT_OF_RANGE;
838 
839 //#ifdef DOUBLE_PRECISION
840 // const double* tmpWeights = inCategoryWeights;
841 //#else
842 // Real* tmpWeights = hWeightsCache;
843 // MEMCNV(hWeightsCache, inCategoryWeights, kCategoryCount, Real);
844 //#endif
845  const Real* tmpWeights = beagleCastIfNecessary(inCategoryWeights, hWeightsCache,
846  kCategoryCount);
847 
848  gpu->MemcpyHostToDevice(dWeights[categoryWeightsIndex], tmpWeights,
849  sizeof(Real) * kCategoryCount);
850 
851 #ifdef BEAGLE_DEBUG_FLOW
852  fprintf(stderr, "\tLeaving BeagleGPUImpl::setCategoryWeights\n");
853 #endif
854 
855  return BEAGLE_SUCCESS;
856 }
857 
858 BEAGLE_GPU_TEMPLATE
859 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setCategoryRates(const double* inCategoryRates) {
860 
861 #ifdef BEAGLE_DEBUG_FLOW
862  fprintf(stderr, "\tEntering BeagleGPUImpl::setCategoryRates\n");
863 #endif
864 
865  const double* categoryRates = inCategoryRates;
866  // Can keep these in double-precision until after multiplication by (double) branch-length
867 
868  memcpy(hCategoryRates, categoryRates, sizeof(double) * kCategoryCount);
869 
870 #ifdef BEAGLE_DEBUG_FLOW
871  fprintf(stderr, "\tLeaving BeagleGPUImpl::setCategoryRates\n");
872 #endif
873 
874  return BEAGLE_SUCCESS;
875 }
876 
877 BEAGLE_GPU_TEMPLATE
878 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setPatternWeights(const double* inPatternWeights) {
879 
880 #ifdef BEAGLE_DEBUG_FLOW
881  fprintf(stderr, "\tEntering BeagleGPUImpl::setPatternWeights\n");
882 #endif
883 
884 //#ifdef DOUBLE_PRECISION
885 // const double* tmpWeights = inPatternWeights;
886 //#else
887 // Real* tmpWeights = hPatternWeightsCache;
888 // MEMCNV(hPatternWeightsCache, inPatternWeights, kPatternCount, Real);
889 //#endif
890  const Real* tmpWeights = beagleCastIfNecessary(inPatternWeights, hPatternWeightsCache, kPatternCount);
891 
892  gpu->MemcpyHostToDevice(dPatternWeights, tmpWeights,
893  sizeof(Real) * kPatternCount);
894 
895 #ifdef BEAGLE_DEBUG_FLOW
896  fprintf(stderr, "\tLeaving BeagleGPUImpl::setPatternWeights\n");
897 #endif
898 
899  return BEAGLE_SUCCESS;
900 }
901 
902 BEAGLE_GPU_TEMPLATE
903 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::getTransitionMatrix(int matrixIndex,
904  double* outMatrix) {
905 #ifdef BEAGLE_DEBUG_FLOW
906  fprintf(stderr, "\tEntering BeagleGPUImpl::getTransitionMatrix\n");
907 #endif
908 
909  gpu->MemcpyDeviceToHost(hMatrixCache, dMatrices[matrixIndex], sizeof(Real) * kMatrixSize * kCategoryCount);
910 
911  double* outMatrixOffset = outMatrix;
912  Real* tmpRealMatrixOffset = hMatrixCache;
913 
914  for (int l = 0; l < kCategoryCount; l++) {
915 
916  transposeSquareMatrix(tmpRealMatrixOffset, kPaddedStateCount);
917 
918  for (int i = 0; i < kStateCount; i++) {
919 //#ifdef DOUBLE_PRECISION
920 // memcpy(outMatrixOffset, tmpRealMatrixOffset, sizeof(Real) * kStateCount);
921 //#else
922 // MEMCNV(outMatrixOffset, tmpRealMatrixOffset, kStateCount, double);
923 //#endif
924  beagleMemCpy(outMatrixOffset, tmpRealMatrixOffset, kStateCount);
925  tmpRealMatrixOffset += kPaddedStateCount;
926  outMatrixOffset += kStateCount;
927  }
928  tmpRealMatrixOffset += (kPaddedStateCount - kStateCount) * kPaddedStateCount;
929  }
930 
931 #ifdef BEAGLE_DEBUG_FLOW
932  fprintf(stderr, "\tLeaving BeagleGPUImpl::getTransitionMatrix\n");
933 #endif
934 
935  return BEAGLE_SUCCESS;
936 }
937 
938 BEAGLE_GPU_TEMPLATE
939 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTransitionMatrix(int matrixIndex,
940  const double* inMatrix,
941  double paddedValue) {
942 
943 #ifdef BEAGLE_DEBUG_FLOW
944  fprintf(stderr, "\tEntering BeagleGPUImpl::setTransitionMatrix\n");
945 #endif
946 
947  const double* inMatrixOffset = inMatrix;
948  Real* tmpRealMatrixOffset = hMatrixCache;
949 
950  for (int l = 0; l < kCategoryCount; l++) {
951  Real* transposeOffset = tmpRealMatrixOffset;
952 
953  for (int i = 0; i < kStateCount; i++) {
954 //#ifdef DOUBLE_PRECISION
955 // memcpy(tmpRealMatrixOffset, inMatrixOffset, sizeof(Real) * kStateCount);
956 //#else
957 // MEMCNV(tmpRealMatrixOffset, inMatrixOffset, kStateCount, Real);
958 //#endif
959  beagleMemCpy(tmpRealMatrixOffset, inMatrixOffset, kStateCount);
960  tmpRealMatrixOffset += kPaddedStateCount;
961  inMatrixOffset += kStateCount;
962  }
963 
964  transposeSquareMatrix(transposeOffset, kPaddedStateCount);
965  tmpRealMatrixOffset += (kPaddedStateCount - kStateCount) * kPaddedStateCount;
966  }
967 
968  // Copy to GPU device
969  gpu->MemcpyHostToDevice(dMatrices[matrixIndex], hMatrixCache,
970  sizeof(Real) * kMatrixSize * kCategoryCount);
971 
972 #ifdef BEAGLE_DEBUG_FLOW
973  fprintf(stderr, "\tLeaving BeagleGPUImpl::setTransitionMatrix\n");
974 #endif
975 
976  return BEAGLE_SUCCESS;
977 }
978 
979 BEAGLE_GPU_TEMPLATE
980 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::setTransitionMatrices(const int* matrixIndices,
981  const double* inMatrices,
982  const double* paddedValues,
983  int count) {
984 
985 #ifdef BEAGLE_DEBUG_FLOW
986  fprintf(stderr, "\tEntering BeagleGPUImpl::setTransitionMatrices\n");
987 #endif
988 
989  int k = 0;
990  while (k < count) {
991  const double* inMatrixOffset = inMatrices + k*kStateCount*kStateCount*kCategoryCount;
992  Real* tmpRealMatrixOffset = hMatrixCache;
993  int lumpedMatricesCount = 0;
994  int matrixIndex = matrixIndices[k];
995 
996  do {
997  for (int l = 0; l < kCategoryCount; l++) {
998  Real* transposeOffset = tmpRealMatrixOffset;
999 
1000  for (int i = 0; i < kStateCount; i++) {
1001 // #ifdef DOUBLE_PRECISION
1002 // memcpy(tmpRealMatrixOffset, inMatrixOffset, sizeof(Real) * kStateCount);
1003 // #else
1004 // MEMCNV(tmpRealMatrixOffset, inMatrixOffset, kStateCount, Real);
1005 // #endif
1006  beagleMemCpy(tmpRealMatrixOffset, inMatrixOffset, kStateCount);
1007  tmpRealMatrixOffset += kPaddedStateCount;
1008  inMatrixOffset += kStateCount;
1009  }
1010 
1011  transposeSquareMatrix(transposeOffset, kPaddedStateCount);
1012  tmpRealMatrixOffset += (kPaddedStateCount - kStateCount) * kPaddedStateCount;
1013  }
1014 
1015  lumpedMatricesCount++;
1016  k++;
1017  } while ((k < count) && (matrixIndices[k] == matrixIndices[k-1] + 1) && (lumpedMatricesCount < BEAGLE_CACHED_MATRICES_COUNT));
1018 
1019  // Copy to GPU device
1020  gpu->MemcpyHostToDevice(dMatrices[matrixIndex], hMatrixCache,
1021  sizeof(Real) * kMatrixSize * kCategoryCount * lumpedMatricesCount);
1022 
1023  }
1024 
1025 #ifdef BEAGLE_DEBUG_FLOW
1026  fprintf(stderr, "\tLeaving BeagleGPUImpl::setTransitionMatrices\n");
1027 #endif
1028 
1029  return BEAGLE_SUCCESS;
1030 }
1031 
1032 BEAGLE_GPU_TEMPLATE
1033 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::updateTransitionMatrices(int eigenIndex,
1034  const int* probabilityIndices,
1035  const int* firstDerivativeIndices,
1036  const int* secondDerivativeIndices,
1037  const double* edgeLengths,
1038  int count) {
1039 #ifdef BEAGLE_DEBUG_FLOW
1040  fprintf(stderr,"\tEntering BeagleGPUImpl::updateTransitionMatrices\n");
1041 #endif
1042  if (count > 0) {
1043  // TODO: improve performance of calculation of derivatives
1044  int totalCount = 0;
1045 
1046  int indexOffset = kMatrixSize * kCategoryCount;
1047  int categoryOffset = kMatrixSize;
1048 
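// CUDA path: build two host queues and launch one batched kernel. hPtrQueue
// holds the element offset of each destination slice,
// probabilityIndex * kMatrixSize * kCategoryCount + category * kMatrixSize,
// and hDistanceQueue holds the matching branch length scaled by (or, for the
// derivative variants, paired with) the category rate; the derivative
// variants also append extra offset blocks of length totalCount.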
1049  #ifdef CUDA
1050 
1051  if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL) {
1052  for (int i = 0; i < count; i++) {
1053  for (int j = 0; j < kCategoryCount; j++) {
1054  hPtrQueue[totalCount] = probabilityIndices[i] * indexOffset + j * categoryOffset;
1055  hDistanceQueue[totalCount] = (Real) (edgeLengths[i] * hCategoryRates[j]);
1056  totalCount++;
1057  }
1058  }
1059 
1060  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * totalCount);
1061  gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue, sizeof(Real) * totalCount);
1062 
1063  // Set-up and call GPU kernel
1064  kernels->GetTransitionProbabilitiesSquare(dMatrices[0], dPtrQueue, dEvec[eigenIndex], dIevc[eigenIndex],
1065  dEigenValues[eigenIndex], dDistanceQueue, totalCount);
1066  } else if (secondDerivativeIndices == NULL) {
1067 
1068  totalCount = count * kCategoryCount;
1069  int ptrIndex = 0;
1070  for (int i = 0; i < count; i++) {
1071  for (int j = 0; j < kCategoryCount; j++) {
1072  hPtrQueue[ptrIndex] = probabilityIndices[i] * indexOffset + j * categoryOffset;
1073  hPtrQueue[ptrIndex + totalCount] = firstDerivativeIndices[i] * indexOffset + j * categoryOffset;
1074  hDistanceQueue[ptrIndex] = (Real) (edgeLengths[i]);
1075  hDistanceQueue[ptrIndex + totalCount] = (Real) (hCategoryRates[j]);
1076  ptrIndex++;
1077  }
1078  }
1079 
1080  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * totalCount * 2);
1081  gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue, sizeof(Real) * totalCount * 2);
1082 
1083  kernels->GetTransitionProbabilitiesSquareFirstDeriv(dMatrices[0], dPtrQueue, dEvec[eigenIndex], dIevc[eigenIndex],
1084  dEigenValues[eigenIndex], dDistanceQueue, totalCount);
1085 
1086  } else {
1087  totalCount = count * kCategoryCount;
1088  int ptrIndex = 0;
1089  for (int i = 0; i < count; i++) {
1090  for (int j = 0; j < kCategoryCount; j++) {
1091  hPtrQueue[ptrIndex] = probabilityIndices[i] * indexOffset + j * categoryOffset;
1092  hPtrQueue[ptrIndex + totalCount] = firstDerivativeIndices[i] * indexOffset + j * categoryOffset;
1093  hPtrQueue[ptrIndex + totalCount*2] = secondDerivativeIndices[i] * indexOffset + j * categoryOffset;
1094  hDistanceQueue[ptrIndex] = (Real) (edgeLengths[i]);
1095  hDistanceQueue[ptrIndex + totalCount] = (Real) (hCategoryRates[j]);
1096  ptrIndex++;
1097  }
1098  }
1099 
1100  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * totalCount * 3);
1101  gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue, sizeof(Real) * totalCount * 2);
1102 
1103  kernels->GetTransitionProbabilitiesSquareSecondDeriv(dMatrices[0], dPtrQueue, dEvec[eigenIndex], dIevc[eigenIndex],
1104  dEigenValues[eigenIndex], dDistanceQueue, totalCount);
1105  }
1106 
1107 
1108  #else
1109  // TODO: update OpenCL implementation with derivs
1110 
1111  for (int i = 0; i < count; i++) {
1112  for (int j = 0; j < kCategoryCount; j++) {
1113  hDistanceQueue[totalCount] = (Real) (edgeLengths[i] * hCategoryRates[j]);
1114  totalCount++;
1115  }
1116  }
1117 
1118  gpu->MemcpyHostToDevice(dDistanceQueue, hDistanceQueue, sizeof(Real) * totalCount);
1119 
1120  // Set-up and call GPU kernel
1121  for (int i = 0; i < count; i++) {
1122  kernels->GetTransitionProbabilitiesSquare(dMatrices[probabilityIndices[i]], dEvec[eigenIndex], dIevc[eigenIndex],
1123  dEigenValues[eigenIndex], dDistanceQueue, kCategoryCount,
1124  i * kCategoryCount);
1125  }
1126 
1127  #endif
1128 
1129  #ifdef BEAGLE_DEBUG_VALUES
1130  Real r = 0;
1131  for (int i = 0; i < 1; i++) {
1132  fprintf(stderr, "dMatrices[probabilityIndices[%d]] (hDQ = %1.5e, eL = %1.5e) =\n", i,hDistanceQueue[i], edgeLengths[i]);
1133  gpu->PrintfDeviceVector(dMatrices[probabilityIndices[i]], kMatrixSize * kCategoryCount, r);
1134  for(int j=0; j<kCategoryCount; j++)
1135  fprintf(stderr, " %1.5f",hCategoryRates[j]);
1136  fprintf(stderr,"\n");
1137  }
1138  #endif
1139 
1140  #ifdef BEAGLE_DEBUG_SYNCH
1141  gpu->Synchronize();
1142  #endif
1143  }
1144 #ifdef BEAGLE_DEBUG_FLOW
1145  fprintf(stderr, "\tLeaving BeagleGPUImpl::updateTransitionMatrices\n");
1146 #endif
1147 
1148  return BEAGLE_SUCCESS;
1149 }
1150 
1151 BEAGLE_GPU_TEMPLATE
1152 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::updatePartials(const int* operations,
1153  int operationCount,
1154  int cumulativeScalingIndex) {
1155 
1156 #ifdef BEAGLE_DEBUG_FLOW
1157  fprintf(stderr, "\tEntering BeagleGPUImpl::updatePartials\n");
1158 #endif
1159 
1160  GPUPtr cumulativeScalingBuffer = 0;
1161  if (cumulativeScalingIndex != BEAGLE_OP_NONE)
1162  cumulativeScalingBuffer = dScalingFactors[cumulativeScalingIndex];
1163 
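// Each operation is a tuple of seven ints, unpacked below in this order:
// destination partials, write-scale index, read-scale index, first-child
// partials, first-child transition matrix, second-child partials,
// second-child transition matrix.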
1164  // Serial version
1165  for (int op = 0; op < operationCount; op++) {
1166  const int parIndex = operations[op * 7];
1167  const int writeScalingIndex = operations[op * 7 + 1];
1168  const int readScalingIndex = operations[op * 7 + 2];
1169  const int child1Index = operations[op * 7 + 3];
1170  const int child1TransMatIndex = operations[op * 7 + 4];
1171  const int child2Index = operations[op * 7 + 5];
1172  const int child2TransMatIndex = operations[op * 7 + 6];
1173 
1174  GPUPtr matrices1 = dMatrices[child1TransMatIndex];
1175  GPUPtr matrices2 = dMatrices[child2TransMatIndex];
1176 
1177  GPUPtr partials1 = dPartials[child1Index];
1178  GPUPtr partials2 = dPartials[child2Index];
1179 
1180  GPUPtr partials3 = dPartials[parIndex];
1181 
1182  GPUPtr tipStates1 = dStates[child1Index];
1183  GPUPtr tipStates2 = dStates[child2Index];
1184 
1185  int rescale = BEAGLE_OP_NONE;
1186  GPUPtr scalingFactors = (GPUPtr)NULL;
1187 
1188  if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
1189  int sIndex = parIndex - kTipCount;
1190 
1191  if (tipStates1 == 0 && tipStates2 == 0) {
1192  rescale = 2;
1193  scalingFactors = dScalingFactors[sIndex];
1194  }
1195  } else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
1196  rescale = 1;
1197  scalingFactors = dScalingFactors[parIndex - kTipCount];
1198  } else if ((kFlags & BEAGLE_FLAG_SCALING_MANUAL) && writeScalingIndex >= 0) {
1199  rescale = 1;
1200  scalingFactors = dScalingFactors[writeScalingIndex];
1201  } else if ((kFlags & BEAGLE_FLAG_SCALING_MANUAL) && readScalingIndex >= 0) {
1202  rescale = 0;
1203  scalingFactors = dScalingFactors[readScalingIndex];
1204  }
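// rescale selects the kernel's scaling behaviour as set above: 1 recomputes
// scale factors and writes them to scalingFactors, 0 reuses previously
// written factors, 2 is the auto-scaling check for internal-internal nodes,
// and BEAGLE_OP_NONE skips scaling.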
1205 
1206 #ifdef BEAGLE_DEBUG_VALUES
1207  fprintf(stderr, "kPaddedPatternCount = %d\n", kPaddedPatternCount);
1208  fprintf(stderr, "kPatternCount = %d\n", kPatternCount);
1209  fprintf(stderr, "categoryCount = %d\n", kCategoryCount);
1210  fprintf(stderr, "partialSize = %d\n", kPartialsSize);
1211  fprintf(stderr, "writeIndex = %d, readIndex = %d, rescale = %d\n",writeScalingIndex,readScalingIndex,rescale);
1212  fprintf(stderr, "child1 = \n");
1213  Real r = 0;
1214  if (tipStates1)
1215  gpu->PrintfDeviceInt(tipStates1, kPaddedPatternCount);
1216  else
1217  gpu->PrintfDeviceVector(partials1, kPartialsSize, r);
1218  fprintf(stderr, "child2 = \n");
1219  if (tipStates2)
1220  gpu->PrintfDeviceInt(tipStates2, kPaddedPatternCount);
1221  else
1222  gpu->PrintfDeviceVector(partials2, kPartialsSize, r);
1223  fprintf(stderr, "node index = %d\n", parIndex);
1224 #endif
1225 
1226  if (tipStates1 != 0) {
1227  if (tipStates2 != 0 ) {
1228  kernels->StatesStatesPruningDynamicScaling(tipStates1, tipStates2, partials3,
1229  matrices1, matrices2, scalingFactors,
1230  cumulativeScalingBuffer,
1231  kPaddedPatternCount, kCategoryCount,
1232  rescale);
1233  } else {
1234  kernels->StatesPartialsPruningDynamicScaling(tipStates1, partials2, partials3,
1235  matrices1, matrices2, scalingFactors,
1236  cumulativeScalingBuffer,
1237  kPaddedPatternCount, kCategoryCount,
1238  rescale);
1239  }
1240  } else {
1241  if (tipStates2 != 0) {
1242  kernels->StatesPartialsPruningDynamicScaling(tipStates2, partials1, partials3,
1243  matrices2, matrices1, scalingFactors,
1244  cumulativeScalingBuffer,
1245  kPaddedPatternCount, kCategoryCount,
1246  rescale);
1247  } else {
1248  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1249  kernels->PartialsPartialsPruningDynamicCheckScaling(partials1, partials2, partials3,
1250  matrices1, matrices2, writeScalingIndex, readScalingIndex,
1251  cumulativeScalingIndex, dScalingFactors, dScalingFactorsMaster,
1252  kPaddedPatternCount, kCategoryCount,
1253  rescale, hRescalingTrigger, dRescalingTrigger, sizeof(Real));
1254  } else {
1255  kernels->PartialsPartialsPruningDynamicScaling(partials1, partials2, partials3,
1256  matrices1, matrices2, scalingFactors,
1257  cumulativeScalingBuffer,
1258  kPaddedPatternCount, kCategoryCount,
1259  rescale);
1260  }
1261  }
1262  }
1263 
1264  if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
1265  int parScalingIndex = parIndex - kTipCount;
1266  int child1ScalingIndex = child1Index - kTipCount;
1267  int child2ScalingIndex = child2Index - kTipCount;
1268  if (child1ScalingIndex >= 0 && child2ScalingIndex >= 0) {
1269  int scalingIndices[2] = {child1ScalingIndex, child2ScalingIndex};
1270  accumulateScaleFactors(scalingIndices, 2, parScalingIndex);
1271  } else if (child1ScalingIndex >= 0) {
1272  int scalingIndices[1] = {child1ScalingIndex};
1273  accumulateScaleFactors(scalingIndices, 1, parScalingIndex);
1274  } else if (child2ScalingIndex >= 0) {
1275  int scalingIndices[1] = {child2ScalingIndex};
1276  accumulateScaleFactors(scalingIndices, 1, parScalingIndex);
1277  }
1278  }
1279 
1280 #ifdef BEAGLE_DEBUG_VALUES
1281  if (rescale > -1) {
1282  fprintf(stderr,"scalars = ");
1283  gpu->PrintfDeviceVector(scalingFactors,kPaddedPatternCount, r);
1284  }
1285  fprintf(stderr, "parent = \n");
1286  int signal = 0;
1287  if (writeScalingIndex == -1)
1288  gpu->PrintfDeviceVector(partials3, kPartialsSize, r);
1289  else
1290  gpu->PrintfDeviceVector(partials3, kPartialsSize, 1.0, &signal, r);
1291 #endif
1292  }
1293 
1294 #ifdef BEAGLE_DEBUG_SYNCH
1295  gpu->Synchronize();
1296 #endif
1297 
1298 #ifdef BEAGLE_DEBUG_FLOW
1299  fprintf(stderr, "\tLeaving BeagleGPUImpl::updatePartials\n");
1300 #endif
1301 
1302  return BEAGLE_SUCCESS;
1303 }
1304 
1305 BEAGLE_GPU_TEMPLATE
1306 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::waitForPartials(const int* /*destinationPartials*/,
1307  int /*destinationPartialsCount*/) {
1308 #ifdef BEAGLE_DEBUG_FLOW
1309  fprintf(stderr, "\tEntering BeagleGPUImpl::waitForPartials\n");
1310 #endif
1311 
1312 #ifdef BEAGLE_DEBUG_FLOW
1313  fprintf(stderr, "\tLeaving BeagleGPUImpl::waitForPartials\n");
1314 #endif
1315 
1316  return BEAGLE_SUCCESS;
1317 }
1318 
1319 BEAGLE_GPU_TEMPLATE
1320 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::accumulateScaleFactors(const int* scalingIndices,
1321  int count,
1322  int cumulativeScalingIndex) {
1323 #ifdef BEAGLE_DEBUG_FLOW
1324  fprintf(stderr, "\tEntering BeagleGPUImpl::accumulateScaleFactors\n");
1325 #endif
1326 
1327  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1328  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1329  gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex], sizeof(Real)*kScaleBufferSize);
1330  gpu->Synchronize();
1331  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1332  }
1333  }
1334 
1335  if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
1336 
1337  for(int n = 0; n < count; n++) {
1338  int sIndex = scalingIndices[n] - kTipCount;
1339  hPtrQueue[n] = sIndex;
1340  }
1341 
1342  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
1343 
1344  kernels->AccumulateFactorsAutoScaling(dScalingFactors[0], dPtrQueue, dAccumulatedScalingFactors, count, kPaddedPatternCount, kScaleBufferSize);
1345 
1346  } else {
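// Non-auto layouts: hPtrQueue carries element offsets relative to
// dScalingFactors[0] (index * kScaleBufferSize) so a single kernel launch can
// fold all requested buffers into the cumulative buffer.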
1347  for(int n = 0; n < count; n++)
1348  hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1349  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
1350 
1351 
1352  #ifdef CUDA
1353  // Accumulate scaling factors at the root
1354  kernels->AccumulateFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex], count, kPaddedPatternCount);
1355  #else // OpenCL
1356  for (int i = 0; i < count; i++) {
1357  kernels->AccumulateFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
1358  1, kPaddedPatternCount);
1359  }
1360  #endif
1361  }
1362 
1363 #ifdef BEAGLE_DEBUG_SYNCH
1364  gpu->Synchronize();
1365 #endif
1366 
1367 #ifdef BEAGLE_DEBUG_VALUES
1368  Real r = 0;
1369  fprintf(stderr, "scaling factors = ");
1370  gpu->PrintfDeviceVector(dScalingFactors[cumulativeScalingIndex], kPaddedPatternCount, r);
1371 #endif
1372 
1373 #ifdef BEAGLE_DEBUG_FLOW
1374  fprintf(stderr, "\tLeaving BeagleGPUImpl::accumulateScaleFactors\n");
1375 #endif
1376 
1377  return BEAGLE_SUCCESS;
1378 }
1379 
1380 BEAGLE_GPU_TEMPLATE
1381 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::removeScaleFactors(const int* scalingIndices,
1382  int count,
1383  int cumulativeScalingIndex) {
1384 
1385 #ifdef BEAGLE_DEBUG_FLOW
1386  fprintf(stderr, "\tEntering BeagleGPUImpl::removeScaleFactors\n");
1387 #endif
1388 
1389  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1390  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex]) {
1391  gpu->MemcpyDeviceToDevice(dScalingFactorsMaster[cumulativeScalingIndex], dScalingFactors[cumulativeScalingIndex], sizeof(Real)*kScaleBufferSize);
1392  gpu->Synchronize();
1393  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1394  }
1395  }
1396 
1397  for(int n = 0; n < count; n++)
1398  hPtrQueue[n] = scalingIndices[n] * kScaleBufferSize;
1399  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
1400 
1401 #ifdef CUDA
1402  // Remove previously accumulated scaling factors at the root
1403  kernels->RemoveFactorsDynamicScaling(dScalingFactors[0], dPtrQueue, dScalingFactors[cumulativeScalingIndex],
1404  count, kPaddedPatternCount);
1405 #else // OpenCL
1406  for (int i = 0; i < count; i++) {
1407  kernels->RemoveFactorsDynamicScaling(dScalingFactors[scalingIndices[i]], dScalingFactors[cumulativeScalingIndex],
1408  1, kPaddedPatternCount);
1409  }
1410 
1411 #endif
1412 
1413 #ifdef BEAGLE_DEBUG_SYNCH
1414  gpu->Synchronize();
1415 #endif
1416 
1417 #ifdef BEAGLE_DEBUG_FLOW
1418  fprintf(stderr, "\tLeaving BeagleGPUImpl::removeScaleFactors\n");
1419 #endif
1420 
1421  return BEAGLE_SUCCESS;
1422 }
1423 
1424 BEAGLE_GPU_TEMPLATE
1425 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::resetScaleFactors(int cumulativeScalingIndex) {
1426 
1427 #ifdef BEAGLE_DEBUG_FLOW
1428  fprintf(stderr, "\tEntering BeagleGPUImpl::resetScaleFactors\n");
1429 #endif
1430 
1431  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1432  if (dScalingFactors[cumulativeScalingIndex] != dScalingFactorsMaster[cumulativeScalingIndex])
1433  dScalingFactors[cumulativeScalingIndex] = dScalingFactorsMaster[cumulativeScalingIndex];
1434 
1435  if (dScalingFactors[cumulativeScalingIndex] == 0) {
1436  dScalingFactors[cumulativeScalingIndex] = gpu->AllocateMemory(kScaleBufferSize * sizeof(Real));
1437  dScalingFactorsMaster[cumulativeScalingIndex] = dScalingFactors[cumulativeScalingIndex];
1438  }
1439  }
1440 
1441  Real* zeroes = (Real*) gpu->CallocHost(sizeof(Real), kPaddedPatternCount);
1442 
1443  // Fill with zeroes
1444  gpu->MemcpyHostToDevice(dScalingFactors[cumulativeScalingIndex], zeroes,
1445  sizeof(Real) * kPaddedPatternCount);
1446 
1447  gpu->FreeHostMemory(zeroes);
1448 
1449 #ifdef BEAGLE_DEBUG_SYNCH
1450  gpu->Synchronize();
1451 #endif
1452 
1453 #ifdef BEAGLE_DEBUG_FLOW
1454  fprintf(stderr, "\tLeaving BeagleGPUImpl::resetScaleFactors\n");
1455 #endif
1456 
1457  return BEAGLE_SUCCESS;
1458 }
1459 
1460 BEAGLE_GPU_TEMPLATE
1461 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::copyScaleFactors(int destScalingIndex,
1462  int srcScalingIndex) {
1463 #ifdef BEAGLE_DEBUG_FLOW
1464  fprintf(stderr, "\tEntering BeagleGPUImpl::copyScaleFactors\n");
1465 #endif
1466 
1467  if (kFlags & BEAGLE_FLAG_SCALING_DYNAMIC) {
1468  dScalingFactors[destScalingIndex] = dScalingFactors[srcScalingIndex];
1469  } else {
1470  gpu->MemcpyDeviceToDevice(dScalingFactors[destScalingIndex], dScalingFactors[srcScalingIndex], sizeof(Real)*kScaleBufferSize);
1471  }
1472 #ifdef BEAGLE_DEBUG_SYNCH
1473  gpu->Synchronize();
1474 #endif
1475 
1476 #ifdef BEAGLE_DEBUG_FLOW
1477  fprintf(stderr, "\tLeaving BeagleGPUImpl::copyScaleFactors\n");
1478 #endif
1479 
1480  return BEAGLE_SUCCESS;
1481 }
1482 
1483 BEAGLE_GPU_TEMPLATE
1484 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::calculateRootLogLikelihoods(const int* bufferIndices,
1485  const int* categoryWeightsIndices,
1486  const int* stateFrequenciesIndices,
1487  const int* cumulativeScaleIndices,
1488  int count,
1489  double* outSumLogLikelihood) {
1490 
1491 #ifdef BEAGLE_DEBUG_FLOW
1492  fprintf(stderr, "\tEntering BeagleGPUImpl::calculateRootLogLikelihoods\n");
1493 #endif
1494 
1495  int returnCode = BEAGLE_SUCCESS;
1496 
1497  if (count == 1) {
1498  const int rootNodeIndex = bufferIndices[0];
1499  const int categoryWeightsIndex = categoryWeightsIndices[0];
1500  const int stateFrequenciesIndex = stateFrequenciesIndices[0];
1501 
1502 
1503  GPUPtr dCumulativeScalingFactor;
1504  bool scale = 1;
1505  if (kFlags & BEAGLE_FLAG_SCALING_AUTO)
1506  dCumulativeScalingFactor = dAccumulatedScalingFactors;
1507  else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)
1508  dCumulativeScalingFactor = dScalingFactors[bufferIndices[0] - kTipCount];
1509  else if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE)
1510  dCumulativeScalingFactor = dScalingFactors[cumulativeScaleIndices[0]];
1511  else
1512  scale = 0;
1513 
1514 #ifdef BEAGLE_DEBUG_VALUES
1515  Real r = 0;
1516  fprintf(stderr,"root partials = \n");
1517  gpu->PrintfDeviceVector(dPartials[rootNodeIndex], kPaddedPatternCount, r);
1518 #endif
1519 
1520  if (scale) {
1521  kernels->IntegrateLikelihoodsDynamicScaling(dIntegrationTmp, dPartials[rootNodeIndex],
1522  dWeights[categoryWeightsIndex],
1523  dFrequencies[stateFrequenciesIndex],
1524  dCumulativeScalingFactor,
1525  kPaddedPatternCount,
1526  kCategoryCount);
1527  } else {
1528  kernels->IntegrateLikelihoods(dIntegrationTmp, dPartials[rootNodeIndex],
1529  dWeights[categoryWeightsIndex],
1530  dFrequencies[stateFrequenciesIndex],
1531  kPaddedPatternCount, kCategoryCount);
1532  }
1533 
1534 #ifdef BEAGLE_DEBUG_VALUES
1535  fprintf(stderr,"before SumSites1 = \n");
1536  gpu->PrintfDeviceVector(dIntegrationTmp, kPaddedPatternCount, r);
1537 #endif
1538 
1539  kernels->SumSites1(dIntegrationTmp, dSumLogLikelihood, dPatternWeights,
1540  kPatternCount);
1541 
1542  gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount);
1543 
1544  *outSumLogLikelihood = 0.0;
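// Sum the per-block partial sums on the host; the (x - x == 0.0) test fails
// for NaN and infinities, flagging BEAGLE_ERROR_FLOATING_POINT without
// stopping the accumulation.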
1545  for (int i = 0; i < kSumSitesBlockCount; i++) {
1546  if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0))
1547  returnCode = BEAGLE_ERROR_FLOATING_POINT;
1548 
1549  *outSumLogLikelihood += hLogLikelihoodsCache[i];
1550  }
1551 
1552  } else {
1553  // TODO: evaluate performance, maybe break up kernels below for each subsetIndex case
1554 
1555  if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
1556  for(int n = 0; n < count; n++) {
1557  int cumulativeScalingFactor = bufferIndices[n] - kTipCount;
1558  hPtrQueue[n] = cumulativeScalingFactor * kScaleBufferSize;
1559  }
1560  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
1561  } else if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE) {
1562  for(int n = 0; n < count; n++)
1563  hPtrQueue[n] = cumulativeScaleIndices[n] * kScaleBufferSize;
1564  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
1565  }
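            // hPtrQueue holds one element offset (index * kScaleBufferSize) per subset into
            // the scale-factor storage (presumably laid out contiguously starting at
            // dScalingFactors[0]); it is uploaded once and consumed below by
            // IntegrateLikelihoodsFixedScaleMulti.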
1566 
1567  for (int subsetIndex = 0 ; subsetIndex < count; ++subsetIndex ) {
1568 
1569  const GPUPtr tmpDWeights = dWeights[categoryWeightsIndices[subsetIndex]];
1570  const GPUPtr tmpDFrequencies = dFrequencies[stateFrequenciesIndices[subsetIndex]];
1571  const int rootNodeIndex = bufferIndices[subsetIndex];
1572 
1573  if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE || (kFlags & BEAGLE_FLAG_SCALING_ALWAYS)) {
1574  kernels->IntegrateLikelihoodsFixedScaleMulti(dIntegrationTmp, dPartials[rootNodeIndex], tmpDWeights,
1575  tmpDFrequencies, dScalingFactors[0], dPtrQueue, dMaxScalingFactors,
1576  dIndexMaxScalingFactors,
1577  kPaddedPatternCount,
1578  kCategoryCount, count, subsetIndex);
1579  } else {
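                    // The trailing flag appears to select how results are combined across
                    // subsets: 0 for the first subset (initialize), 1 for the last subset
                    // (finalize, presumably taking the log), and 2 for intermediate subsets.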
1580  if (subsetIndex == 0) {
1581  kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartials[rootNodeIndex], tmpDWeights,
1582  tmpDFrequencies,
1583  kPaddedPatternCount, kCategoryCount, 0);
1584  } else if (subsetIndex == count - 1) {
1585  kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartials[rootNodeIndex], tmpDWeights,
1586  tmpDFrequencies,
1587  kPaddedPatternCount, kCategoryCount, 1);
1588  } else {
1589  kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartials[rootNodeIndex], tmpDWeights,
1590  tmpDFrequencies,
1591  kPaddedPatternCount, kCategoryCount, 2);
1592  }
1593  }
1594 
1595 
1596  kernels->SumSites1(dIntegrationTmp, dSumLogLikelihood, dPatternWeights,
1597  kPatternCount);
1598 
1599  gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount);
1600 
1601  *outSumLogLikelihood = 0.0;
1602  for (int i = 0; i < kSumSitesBlockCount; i++) {
1603  if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0))
1604  returnCode = BEAGLE_ERROR_FLOATING_POINT;
1605 
1606  *outSumLogLikelihood += hLogLikelihoodsCache[i];
1607  }
1608  }
1609  }
1610 
1611 #ifdef BEAGLE_DEBUG_VALUES
1612  Real r = 0;
1613  fprintf(stderr, "parent = \n");
1614  fprintf(stderr, "root site likelihoods = \n");
1615 #endif
1616 
1617 
1618 #ifdef BEAGLE_DEBUG_FLOW
1619  fprintf(stderr, "\tLeaving BeagleGPUImpl::calculateRootLogLikelihoods\n");
1620 #endif
1621 
1622  return returnCode;
1623 }
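// Illustrative call (sketch only, not part of the library; the impl pointer and index
// values are hypothetical): integrate a single root buffer without explicit scaling.
//
//   int root = tipCount;               // first internal partials buffer
//   int weights = 0, freqs = 0;
//   int cumScale = BEAGLE_OP_NONE;
//   double lnL = 0.0;
//   impl->calculateRootLogLikelihoods(&root, &weights, &freqs, &cumScale, 1, &lnL);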
1624 
1625 BEAGLE_GPU_TEMPLATE
1626 int BeagleGPUImpl<BEAGLE_GPU_GENERIC>::calculateEdgeLogLikelihoods(const int* parentBufferIndices,
1627  const int* childBufferIndices,
1628  const int* probabilityIndices,
1629  const int* firstDerivativeIndices,
1630  const int* secondDerivativeIndices,
1631  const int* categoryWeightsIndices,
1632  const int* stateFrequenciesIndices,
1633  const int* cumulativeScaleIndices,
1634  int count,
1635  double* outSumLogLikelihood,
1636  double* outSumFirstDerivative,
1637  double* outSumSecondDerivative) {
1638 
1639 #ifdef BEAGLE_DEBUG_FLOW
1640  fprintf(stderr, "\tEntering BeagleGPUImpl::calculateEdgeLogLikelihoods\n");
1641 #endif
1642 
1643  int returnCode = BEAGLE_SUCCESS;
1644 
1645  if (count == 1) {
1646 
1647 
1648  const int parIndex = parentBufferIndices[0];
1649  const int childIndex = childBufferIndices[0];
1650  const int probIndex = probabilityIndices[0];
1651 
1652  const int categoryWeightsIndex = categoryWeightsIndices[0];
1653  const int stateFrequenciesIndex = stateFrequenciesIndices[0];
1654 
1655 
1656  GPUPtr partialsParent = dPartials[parIndex];
1657  GPUPtr partialsChild = dPartials[childIndex];
1658  GPUPtr statesChild = dStates[childIndex];
1659  GPUPtr transMatrix = dMatrices[probIndex];
1660 
1661 
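            // Select cumulative scale factors as in calculateRootLogLikelihoods. Under
            // ALWAYS scaling, the scale buffer at index kInternalPartialsBufferCount
            // (presumably a scratch buffer beyond the per-node ones) is reset and the
            // parent/child node factors are accumulated into it; tip nodes (negative
            // indices after the kTipCount offset) are skipped.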
1662  GPUPtr dCumulativeScalingFactor;
1663  bool scale = true;
1664  if (kFlags & BEAGLE_FLAG_SCALING_AUTO) {
1665  dCumulativeScalingFactor = dAccumulatedScalingFactors;
1666  } else if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
1667  int cumulativeScalingFactor = kInternalPartialsBufferCount;
1668  int child1ScalingIndex = parIndex - kTipCount;
1669  int child2ScalingIndex = childIndex - kTipCount;
1670  resetScaleFactors(cumulativeScalingFactor);
1671  if (child1ScalingIndex >= 0 && child2ScalingIndex >= 0) {
1672  int scalingIndices[2] = {child1ScalingIndex, child2ScalingIndex};
1673  accumulateScaleFactors(scalingIndices, 2, cumulativeScalingFactor);
1674  } else if (child1ScalingIndex >= 0) {
1675  int scalingIndices[1] = {child1ScalingIndex};
1676  accumulateScaleFactors(scalingIndices, 1, cumulativeScalingFactor);
1677  } else if (child2ScalingIndex >= 0) {
1678  int scalingIndices[1] = {child2ScalingIndex};
1679  accumulateScaleFactors(scalingIndices, 1, cumulativeScalingFactor);
1680  }
1681  dCumulativeScalingFactor = dScalingFactors[cumulativeScalingFactor];
1682  } else if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE) {
1683  dCumulativeScalingFactor = dScalingFactors[cumulativeScaleIndices[0]];
1684  } else {
1685  scale = false;
1686  }
1687 
1688  if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL) {
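            // dStates[childIndex] is non-zero only when the child was supplied as a compact
            // (tip state) buffer, so the states/partials kernel is used for tips and the
            // partials/partials kernel otherwise.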
1689  if (statesChild != 0) {
1690  kernels->StatesPartialsEdgeLikelihoods(dPartialsTmp, partialsParent, statesChild,
1691  transMatrix, kPaddedPatternCount,
1692  kCategoryCount);
1693  } else {
1694  kernels->PartialsPartialsEdgeLikelihoods(dPartialsTmp, partialsParent, partialsChild,
1695  transMatrix, kPaddedPatternCount,
1696  kCategoryCount);
1697  }
1698 
1699 
1700  if (scale) {
1701  kernels->IntegrateLikelihoodsDynamicScaling(dIntegrationTmp, dPartialsTmp, dWeights[categoryWeightsIndex],
1702  dFrequencies[stateFrequenciesIndex],
1703  dCumulativeScalingFactor,
1704  kPaddedPatternCount, kCategoryCount);
1705  } else {
1706  kernels->IntegrateLikelihoods(dIntegrationTmp, dPartialsTmp, dWeights[categoryWeightsIndex],
1707  dFrequencies[stateFrequenciesIndex],
1708  kPaddedPatternCount, kCategoryCount);
1709  }
1710 
1711  kernels->SumSites1(dIntegrationTmp, dSumLogLikelihood, dPatternWeights,
1712  kPatternCount);
1713 
1714  gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount);
1715 
1716  *outSumLogLikelihood = 0.0;
1717  for (int i = 0; i < kSumSitesBlockCount; i++) {
1718  if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0))
1719  returnCode = BEAGLE_ERROR_FLOATING_POINT;
1720 
1721  *outSumLogLikelihood += hLogLikelihoodsCache[i];
1722  }
1723  } else if (secondDerivativeIndices == NULL) {
1724  // TODO: remove this "hack" for a proper version that only calculates firstDeriv
1725 
1726  const int firstDerivIndex = firstDerivativeIndices[0];
1727  GPUPtr firstDerivMatrix = dMatrices[firstDerivIndex];
1728  GPUPtr secondDerivMatrix = dMatrices[firstDerivIndex];
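            // Only a first-derivative index was supplied, so the first-derivative matrix is
            // also passed as the second-derivative matrix; the resulting second-derivative
            // output is simply never read back below. This is the "hack" referred to in the
            // TODO above.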
1729 
1730  if (statesChild != 0) {
1731  // TODO: test GPU derivative matrices for statesPartials (including extra ambiguity column)
1732  kernels->StatesPartialsEdgeLikelihoodsSecondDeriv(dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
1733  partialsParent, statesChild,
1734  transMatrix, firstDerivMatrix, secondDerivMatrix,
1735  kPaddedPatternCount, kCategoryCount);
1736  } else {
1737  kernels->PartialsPartialsEdgeLikelihoodsSecondDeriv(dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
1738  partialsParent, partialsChild,
1739  transMatrix, firstDerivMatrix, secondDerivMatrix,
1740  kPaddedPatternCount, kCategoryCount);
1741 
1742  }
1743 
1744  if (scale) {
1745  kernels->IntegrateLikelihoodsDynamicScalingSecondDeriv(dIntegrationTmp, dOutFirstDeriv, dOutSecondDeriv,
1746  dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
1747  dWeights[categoryWeightsIndex],
1748  dFrequencies[stateFrequenciesIndex],
1749  dCumulativeScalingFactor,
1750  kPaddedPatternCount, kCategoryCount);
1751  } else {
1752  kernels->IntegrateLikelihoodsSecondDeriv(dIntegrationTmp, dOutFirstDeriv, dOutSecondDeriv,
1753  dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
1754  dWeights[categoryWeightsIndex],
1755  dFrequencies[stateFrequenciesIndex],
1756  kPaddedPatternCount, kCategoryCount);
1757  }
1758 
1759 
1760  kernels->SumSites2(dIntegrationTmp, dSumLogLikelihood, dOutFirstDeriv, dSumFirstDeriv, dPatternWeights,
1761  kPatternCount);
1762 
1763  gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount);
1764 
1765  *outSumLogLikelihood = 0.0;
1766  for (int i = 0; i < kSumSitesBlockCount; i++) {
1767  if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0))
1768  returnCode = BEAGLE_ERROR_FLOATING_POINT;
1769 
1770  *outSumLogLikelihood += hLogLikelihoodsCache[i];
1771  }
1772 
1773  gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumFirstDeriv, sizeof(Real) * kSumSitesBlockCount);
1774 
1775  *outSumFirstDerivative = 0.0;
1776  for (int i = 0; i < kSumSitesBlockCount; i++) {
1777  *outSumFirstDerivative += hLogLikelihoodsCache[i];
1778  }
1779 
1780  } else {
1781  // TODO: improve performance of GPU implementation of derivatives for calculateEdgeLnL
1782 
1783  const int firstDerivIndex = firstDerivativeIndices[0];
1784  const int secondDerivIndex = secondDerivativeIndices[0];
1785  GPUPtr firstDerivMatrix = dMatrices[firstDerivIndex];
1786  GPUPtr secondDerivMatrix = dMatrices[secondDerivIndex];
1787 
1788  if (statesChild != 0) {
1789  // TODO: test GPU derivative matrices for statesPartials (including extra ambiguity column)
1790  kernels->StatesPartialsEdgeLikelihoodsSecondDeriv(dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
1791  partialsParent, statesChild,
1792  transMatrix, firstDerivMatrix, secondDerivMatrix,
1793  kPaddedPatternCount, kCategoryCount);
1794  } else {
1795  kernels->PartialsPartialsEdgeLikelihoodsSecondDeriv(dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
1796  partialsParent, partialsChild,
1797  transMatrix, firstDerivMatrix, secondDerivMatrix,
1798  kPaddedPatternCount, kCategoryCount);
1799 
1800  }
1801 
1802  if (scale) {
1803  kernels->IntegrateLikelihoodsDynamicScalingSecondDeriv(dIntegrationTmp, dOutFirstDeriv, dOutSecondDeriv,
1804  dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
1805  dWeights[categoryWeightsIndex],
1806  dFrequencies[stateFrequenciesIndex],
1807  dCumulativeScalingFactor,
1808  kPaddedPatternCount, kCategoryCount);
1809  } else {
1810  kernels->IntegrateLikelihoodsSecondDeriv(dIntegrationTmp, dOutFirstDeriv, dOutSecondDeriv,
1811  dPartialsTmp, dFirstDerivTmp, dSecondDerivTmp,
1812  dWeights[categoryWeightsIndex],
1813  dFrequencies[stateFrequenciesIndex],
1814  kPaddedPatternCount, kCategoryCount);
1815  }
1816 
1817  kernels->SumSites3(dIntegrationTmp, dSumLogLikelihood, dOutFirstDeriv, dSumFirstDeriv, dOutSecondDeriv, dSumSecondDeriv, dPatternWeights,
1818  kPatternCount);
1819 
1820  gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount);
1821 
1822  *outSumLogLikelihood = 0.0;
1823  for (int i = 0; i < kSumSitesBlockCount; i++) {
1824  if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0))
1825  returnCode = BEAGLE_ERROR_FLOATING_POINT;
1826 
1827  *outSumLogLikelihood += hLogLikelihoodsCache[i];
1828  }
1829 
1830  gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumFirstDeriv, sizeof(Real) * kSumSitesBlockCount);
1831 
1832  *outSumFirstDerivative = 0.0;
1833  for (int i = 0; i < kSumSitesBlockCount; i++) {
1834  *outSumFirstDerivative += hLogLikelihoodsCache[i];
1835  }
1836 
1837  gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumSecondDeriv, sizeof(Real) * kSumSitesBlockCount);
1838 
1839  *outSumSecondDerivative = 0.0;
1840  for (int i = 0; i < kSumSitesBlockCount; i++) {
1841  *outSumSecondDerivative += hLogLikelihoodsCache[i];
1842  }
1843  }
1844 
1845 
1846  } else { //count > 1
1847  if (firstDerivativeIndices == NULL && secondDerivativeIndices == NULL) {
1848 
1849  if (kFlags & BEAGLE_FLAG_SCALING_ALWAYS) {
1850  fprintf(stderr,"BeagleGPUImpl::calculateEdgeLogLikelihoods not yet implemented for count > 1 and SCALING_ALWAYS\n");
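                // Note: only a warning is printed here; execution continues and the loop
                // below integrates without the per-node scale factors.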
1851  } else if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE) {
1852  for(int n = 0; n < count; n++)
1853  hPtrQueue[n] = cumulativeScaleIndices[n] * kScaleBufferSize;
1854  gpu->MemcpyHostToDevice(dPtrQueue, hPtrQueue, sizeof(unsigned int) * count);
1855  }
1856 
1857  for (int subsetIndex = 0 ; subsetIndex < count; ++subsetIndex ) {
1858 
1859  const int parIndex = parentBufferIndices[subsetIndex];
1860  const int childIndex = childBufferIndices[subsetIndex];
1861  const int probIndex = probabilityIndices[subsetIndex];
1862 
1863  GPUPtr partialsParent = dPartials[parIndex];
1864  GPUPtr partialsChild = dPartials[childIndex];
1865  GPUPtr statesChild = dStates[childIndex];
1866  GPUPtr transMatrix = dMatrices[probIndex];
1867 
1868  const GPUPtr tmpDWeights = dWeights[categoryWeightsIndices[subsetIndex]];
1869  const GPUPtr tmpDFrequencies = dFrequencies[stateFrequenciesIndices[subsetIndex]];
1870 
1871  if (statesChild != 0) {
1872  kernels->StatesPartialsEdgeLikelihoods(dPartialsTmp, partialsParent, statesChild,
1873  transMatrix, kPaddedPatternCount,
1874  kCategoryCount);
1875  } else {
1876  kernels->PartialsPartialsEdgeLikelihoods(dPartialsTmp, partialsParent, partialsChild,
1877  transMatrix, kPaddedPatternCount,
1878  kCategoryCount);
1879  }
1880 
1881  if (cumulativeScaleIndices[0] != BEAGLE_OP_NONE) {
1882  kernels->IntegrateLikelihoodsFixedScaleMulti(dIntegrationTmp, dPartialsTmp, tmpDWeights,
1883  tmpDFrequencies, dScalingFactors[0], dPtrQueue, dMaxScalingFactors,
1884  dIndexMaxScalingFactors,
1885  kPaddedPatternCount,
1886  kCategoryCount, count, subsetIndex);
1887  } else {
1888  if (subsetIndex == 0) {
1889  kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartialsTmp, tmpDWeights,
1890  tmpDFrequencies,
1891  kPaddedPatternCount, kCategoryCount, 0);
1892  } else if (subsetIndex == count - 1) {
1893  kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartialsTmp, tmpDWeights,
1894  tmpDFrequencies,
1895  kPaddedPatternCount, kCategoryCount, 1);
1896  } else {
1897  kernels->IntegrateLikelihoodsMulti(dIntegrationTmp, dPartialsTmp, tmpDWeights,
1898  tmpDFrequencies,
1899  kPaddedPatternCount, kCategoryCount, 2);
1900  }
1901  }
1902 
1903  kernels->SumSites1(dIntegrationTmp, dSumLogLikelihood, dPatternWeights,
1904  kPatternCount);
1905 
1906  gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dSumLogLikelihood, sizeof(Real) * kSumSitesBlockCount);
1907 
1908  *outSumLogLikelihood = 0.0;
1909  for (int i = 0; i < kSumSitesBlockCount; i++) {
1910  if (!(hLogLikelihoodsCache[i] - hLogLikelihoodsCache[i] == 0.0))
1911  returnCode = BEAGLE_ERROR_FLOATING_POINT;
1912 
1913  *outSumLogLikelihood += hLogLikelihoodsCache[i];
1914  }
1915  }
1916 
1917  } else {
1918  fprintf(stderr,"BeagleGPUImpl::calculateEdgeLogLikelihoods not yet implemented for count > 1 and derivatives\n");
1919  returnCode = BEAGLE_ERROR_GENERAL;
1920  }
1921  }
1922 
1923 
1924 #ifdef BEAGLE_DEBUG_FLOW
1925  fprintf(stderr, "\tLeaving BeagleGPUImpl::calculateEdgeLogLikelihoods\n");
1926 #endif
1927 
1928  return returnCode;
1929 }
1930 
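// Precision-specific specializations: the single-precision version stages the device
// result through the host cache and widens it to double with beagleMemCpy, while the
// double-precision version copies directly into the caller's buffer.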
1931 template<>
1932 int BeagleGPUImpl<float>::getSiteLogLikelihoods(double* outLogLikelihoods) {
1933 
1934 #ifdef BEAGLE_DEBUG_FLOW
1935  fprintf(stderr, "\tEntering BeagleGPUImpl::getSiteLogLikelihoods\n");
1936 #endif
1937 
1938 //#ifdef DOUBLE_PRECISION
1939 // gpu->MemcpyDeviceToHost(outLogLikelihoods, dIntegrationTmp, sizeof(Real) * kPatternCount);
1940 //#else
1941  gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dIntegrationTmp, sizeof(float) * kPatternCount);
1942 // MEMCNV(outLogLikelihoods, hLogLikelihoodsCache, kPatternCount, double);
1943  beagleMemCpy(outLogLikelihoods, hLogLikelihoodsCache, kPatternCount);
1944 //#endif
1945 
1946 #ifdef BEAGLE_DEBUG_FLOW
1947  fprintf(stderr, "\tLeaving BeagleGPUImpl::getSiteLogLikelihoods\n");
1948 #endif
1949 
1950  return BEAGLE_SUCCESS;
1951 }
1952 
1953 template<>
1954 int BeagleGPUImpl<double>::getSiteLogLikelihoods(double* outLogLikelihoods) {
1955 
1956 #ifdef BEAGLE_DEBUG_FLOW
1957  fprintf(stderr, "\tEntering BeagleGPUImpl::getSiteLogLikelihoods\n");
1958 #endif
1959 
1960 //#ifdef DOUBLE_PRECISION
1961  gpu->MemcpyDeviceToHost(outLogLikelihoods, dIntegrationTmp, sizeof(double) * kPatternCount);
1962 //#else
1963 // gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dIntegrationTmp, sizeof(Real) * kPatternCount);
1964 // MEMCNV(outLogLikelihoods, hLogLikelihoodsCache, kPatternCount, double);
1965 //#endif
1966 
1967 #ifdef BEAGLE_DEBUG_FLOW
1968  fprintf(stderr, "\tLeaving BeagleGPUImpl::getSiteLogLikelihoods\n");
1969 #endif
1970 
1971  return BEAGLE_SUCCESS;
1972 }
1973 
1974 template<>
1975 int BeagleGPUImpl<double>::getSiteDerivatives(double* outFirstDerivatives,
1976  double* outSecondDerivatives) {
1977 
1978 #ifdef BEAGLE_DEBUG_FLOW
1979  fprintf(stderr, "\tEntering BeagleGPUImpl::getSiteDerivatives\n");
1980 #endif
1981 
1982 //#ifdef DOUBLE_PRECISION
1983  gpu->MemcpyDeviceToHost(outFirstDerivatives, dOutFirstDeriv, sizeof(double) * kPatternCount);
1984 //#else
1985 // gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dOutFirstDeriv, sizeof(Real) * kPatternCount);
1986 // MEMCNV(outFirstDerivatives, hLogLikelihoodsCache, kPatternCount, double);
1987 //#endif
1988 
1989  if (outSecondDerivatives != NULL) {
1990 //#ifdef DOUBLE_PRECISION
1991  gpu->MemcpyDeviceToHost(outSecondDerivatives, dOutSecondDeriv, sizeof(double) * kPatternCount);
1992 //#else
1993 // gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dOutSecondDeriv, sizeof(Real) * kPatternCount);
1994 // MEMCNV(outSecondDerivatives, hLogLikelihoodsCache, kPatternCount, double);
1995 //#endif
1996  }
1997 
1998 
1999 #ifdef BEAGLE_DEBUG_FLOW
2000  fprintf(stderr, "\tLeaving BeagleGPUImpl::getSiteDerivatives\n");
2001 #endif
2002 
2003  return BEAGLE_SUCCESS;
2004 }
2005 
2006 template<>
2007 int BeagleGPUImpl<float>::getSiteDerivatives(double* outFirstDerivatives,
2008  double* outSecondDerivatives) {
2009 
2010 #ifdef BEAGLE_DEBUG_FLOW
2011  fprintf(stderr, "\tEntering BeagleGPUImpl::getSiteDerivatives\n");
2012 #endif
2013 
2014 //#ifdef DOUBLE_PRECISION
2015 // gpu->MemcpyDeviceToHost(outFirstDerivatives, dOutFirstDeriv, sizeof(Real) * kPatternCount);
2016 //#else
2017  gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dOutFirstDeriv, sizeof(float) * kPatternCount);
2018  beagleMemCpy(outFirstDerivatives, hLogLikelihoodsCache, kPatternCount);
2019 // MEMCNV(outFirstDerivatives, hLogLikelihoodsCache, kPatternCount, double);
2020 //#endif
2021 
2022  if (outSecondDerivatives != NULL) {
2023 //#ifdef DOUBLE_PRECISION
2024 // gpu->MemcpyDeviceToHost(outSecondDerivatives, dOutSecondDeriv, sizeof(Real) * kPatternCount);
2025 //#else
2026  gpu->MemcpyDeviceToHost(hLogLikelihoodsCache, dOutSecondDeriv, sizeof(float) * kPatternCount);
2027  beagleMemCpy(outSecondDerivatives, hLogLikelihoodsCache, kPatternCount);
2028 // MEMCNV(outSecondDerivatives, hLogLikelihoodsCache, kPatternCount, double);
2029 //#endif
2030  }
2031 
2032 
2033 #ifdef BEAGLE_DEBUG_FLOW
2034  fprintf(stderr, "\tLeaving BeagleGPUImpl::getSiteDerivatives\n");
2035 #endif
2036 
2037  return BEAGLE_SUCCESS;
2038 }
2039 
2041 // BeagleGPUImplFactory public methods
2042 
2043 BEAGLE_GPU_TEMPLATE
2044 BeagleImpl* BeagleGPUImplFactory<BEAGLE_GPU_GENERIC>::createImpl(int tipCount,
2045  int partialsBufferCount,
2046  int compactBufferCount,
2047  int stateCount,
2048  int patternCount,
2049  int eigenBufferCount,
2050  int matrixBufferCount,
2051  int categoryCount,
2052  int scaleBufferCount,
2053  int resourceNumber,
2054  long preferenceFlags,
2055  long requirementFlags,
2056  int* errorCode) {
2057  BeagleImpl* impl = new BeagleGPUImpl<BEAGLE_GPU_GENERIC>();
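    // On failure the partially constructed implementation is deleted and *errorCode
    // records the reason; if createInstance throws, the error code is set before
    // rethrowing so a caller that catches the exception can still inspect it.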
2058  try {
2059  *errorCode =
2060  impl->createInstance(tipCount, partialsBufferCount, compactBufferCount, stateCount,
2061  patternCount, eigenBufferCount, matrixBufferCount,
2062  categoryCount,scaleBufferCount, resourceNumber, preferenceFlags, requirementFlags);
2063  if (*errorCode == BEAGLE_SUCCESS) {
2064  return impl;
2065  }
2066  delete impl;
2067  return NULL;
2068  }
2069  catch(...)
2070  {
2071  delete impl;
2072  *errorCode = BEAGLE_ERROR_GENERAL;
2073  throw;
2074  }
2078 }
2079 
2080 template<>
2081 const char* BeagleGPUImplFactory<double>::getName() {
2082  return "GPU-DP";
2083 }
2084 
2085 template<>
2086 const char* BeagleGPUImplFactory<float>::getName() {
2087  return "GPU-SP";
2088 }
2089 
2090 template<>
2091 void modifyFlagsForPrecision(long *flags, double r) {
2092  *flags |= BEAGLE_FLAG_PRECISION_DOUBLE;
2093 }
2094 
2095 template<>
2096 void modifyFlagsForPrecision(long *flags, float r) {
2097  *flags |= BEAGLE_FLAG_PRECISION_SINGLE;
2098 }
2099 
2100 BEAGLE_GPU_TEMPLATE
2101 const long BeagleGPUImplFactory<BEAGLE_GPU_GENERIC>::getFlags() {
2102  long flags = BEAGLE_FLAG_COMPUTATION_SYNCH |
2103  BEAGLE_FLAG_SCALING_MANUAL | BEAGLE_FLAG_SCALING_ALWAYS | BEAGLE_FLAG_SCALING_AUTO | BEAGLE_FLAG_SCALING_DYNAMIC |
2104  BEAGLE_FLAG_THREADING_NONE |
2105  BEAGLE_FLAG_VECTOR_NONE |
2106  BEAGLE_FLAG_PROCESSOR_GPU |
2107  BEAGLE_FLAG_SCALERS_LOG | BEAGLE_FLAG_SCALERS_RAW |
2108  BEAGLE_FLAG_EIGEN_COMPLEX | BEAGLE_FLAG_EIGEN_REAL |
2109  BEAGLE_FLAG_INVEVEC_STANDARD | BEAGLE_FLAG_INVEVEC_TRANSPOSED;
2110 
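    // Add the precision flag (single or double) matching this instantiation's Real type.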
2111  Real r = 0;
2112  modifyFlagsForPrecision(&flags, r);
2113  return flags;
2114 }
2115 
2116 } // end of gpu namespace
2117 } // end of beagle namespace