Actual source code: sveccuda.cu

slepc-3.12.1 2019-11-08
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-2019, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV implemented as a single Vec (CUDA version)
*/

#include <slepc/private/bvimpl.h>
#include "../src/sys/classes/bv/impls/svec/svec.h"
#include <petsccublas.h>

#if defined(PETSC_USE_COMPLEX)
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#endif

#define BLOCKSIZE 64
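/* Default row-block size for BVMultInPlace_Svec_CUDA when the whole work
   matrix does not fit in free device memory; the actual block size is
   recomputed from the available memory at run time */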

/*
    B := alpha*A + beta*B

    A,B are nxk (ld=n)
 */
static PetscErrorCode BVAXPY_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscScalar beta,PetscScalar *d_B)
{
  PetscErrorCode ierr;
  PetscBLASInt   m,one=1;
  cublasStatus_t cberr;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(n_*k_,&m);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (beta!=(PetscScalar)1.0) {
    cberr = cublasXscal(cublasv2handle,m,&beta,d_B,one);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuFlops(1.0*m);CHKERRQ(ierr);
  }
  cberr = cublasXaxpy(cublasv2handle,m,&alpha,d_A,one,d_B,one);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = PetscLogGpuFlops(2.0*m);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    C := alpha*A*B + beta*C
*/
PetscErrorCode BVMult_Svec_CUDA(BV Y,PetscScalar alpha,PetscScalar beta,BV X,Mat Q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *y = (BV_SVEC*)Y->data,*x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*q,*d_q,*d_B,*d_C;
  PetscInt          m,n,k,ldq,mq;
  cublasStatus_t    cberr;
  cudaError_t       err;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  m = Y->n;
  n = Y->k-Y->l;
  k = X->k-X->l;
  if (!Y->n) PetscFunctionReturn(0);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDAGetArrayWrite(y->v,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDAGetArray(y->v,&d_py);CHKERRQ(ierr);
  }
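  /* columns are stored contiguously with leading dimension equal to the local
     length, so the first active column is found by skipping the nc constrained
     columns plus the l leading columns */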
  d_A = d_px+(X->nc+X->l)*X->n;
  d_C = d_py+(Y->nc+Y->l)*Y->n;
  if (Q) {
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = MatGetSize(Q,&ldq,&mq);CHKERRQ(ierr);
    ierr = MatDenseGetArray(Q,&q);CHKERRQ(ierr);
    err = cudaMalloc((void**)&d_q,ldq*mq*sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(d_q,q,ldq*mq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscLogCpuToGpu(ldq*mq*sizeof(PetscScalar));CHKERRQ(ierr);
    d_B = d_q+Y->l*ldq+X->l;
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,m,d_B,ldq,&beta,d_C,m);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = MatDenseRestoreArray(Q,&q);CHKERRQ(ierr);
    err = cudaFree(d_q);CHKERRCUDA(err);
    ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  } else {
    ierr = BVAXPY_BLAS_CUDA(Y,m,n,alpha,d_A,beta,d_C);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDARestoreArrayWrite(y->v,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDARestoreArray(y->v,&d_py);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
    y := alpha*A*x + beta*y
*/
PetscErrorCode BVMultVec_Svec_CUDA(BV X,PetscScalar alpha,PetscScalar beta,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*d_q,*d_x,*d_y;
  PetscBLASInt      n,k,one=1;
  cublasStatus_t    cberr;
  cudaError_t       err;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  n = X->n;
  k = X->k-X->l;
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDAGetArrayWrite(y,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDAGetArray(y,&d_py);CHKERRQ(ierr);
  }
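  /* the coefficients q live on the host: either reuse the BV buffer vector,
     which is already available on the device, or stage a temporary copy */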
  if (!q) {
    ierr = VecCUDAGetArray(X->buffer,&d_q);CHKERRQ(ierr);
  } else {
    err = cudaMalloc((void**)&d_q,k*sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(d_q,q,k*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscLogCpuToGpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_q;
  d_y = d_py;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,n,d_x,one,&beta,d_y,one);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  if (beta==(PetscScalar)0.0) {
    ierr = VecCUDARestoreArrayWrite(y,&d_py);CHKERRQ(ierr);
  } else {
    ierr = VecCUDARestoreArray(y,&d_py);CHKERRQ(ierr);
  }
  if (!q) {
    ierr = VecCUDARestoreArray(X->buffer,&d_q);CHKERRQ(ierr);
  } else {
    err = cudaFree(d_q);CHKERRCUDA(err);
  }
  ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    A(:,s:e-1) := A*B(:,s:e-1)
*/
PetscErrorCode BVMultInPlace_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       m,n,j,k,l,ldq,nq,bs=BLOCKSIZE;
  cublasStatus_t cberr;
  cudaError_t    err;
  size_t         freemem,totmem;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  m = V->n;
  n = e-s;
  k = V->k-V->l;
  if (!m) PetscFunctionReturn(0);
  ierr = MatGetSize(Q,&ldq,&nq);CHKERRQ(ierr);
  ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = MatDenseGetArray(Q,&q);CHKERRQ(ierr);
  err = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  ierr = PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* try to allocate the whole matrix */
  err = cudaMemGetInfo(&freemem,&totmem);CHKERRCUDA(err);
  if (freemem>=m*n*sizeof(PetscScalar)) {
    err = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(err);
    d_A = d_pv+(V->nc+V->l)*m;
    d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,m);CHKERRCUBLAS(cberr);
    for (j=0;j<n;j++) {
      err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
    }
  } else {
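    /* not enough free memory for the whole result: process the rows in chunks,
       first a remainder block of m%bs rows, then full blocks of bs rows */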
    bs = freemem/(m*sizeof(PetscScalar));
    err = cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar));CHKERRCUDA(err);
    l = m % bs;
    if (l) {
      d_A = d_pv+(V->nc+V->l)*m;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,l,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,l);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*l),l*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
      }
    }
    for (;l<m;l+=bs) {
      d_A = d_pv+(V->nc+V->l)*m+l;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,bs,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,bs);CHKERRCUBLAS(cberr);
      for (j=0;j<n;j++) {
        err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*bs),bs*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
      }
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = MatDenseRestoreArray(Q,&q);CHKERRQ(ierr);
  err = cudaFree(d_q);CHKERRCUDA(err);
  err = cudaFree(d_work);CHKERRCUDA(err);
  ierr = VecCUDARestoreArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    A(:,s:e-1) := A*B(s:e-1,:)'
*/
PetscErrorCode BVMultInPlaceTranspose_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)V->data;
  PetscScalar    *d_pv,*q,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  PetscInt       m,n,j,k,ldq,nq;
  cublasStatus_t cberr;
  cudaError_t    err;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  m = V->n;
  n = e-s;
  k = V->k-V->l;
  if (!m) PetscFunctionReturn(0);
  ierr = MatGetSize(Q,&ldq,&nq);CHKERRQ(ierr);
  ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = MatDenseGetArray(Q,&q);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  err = cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  ierr = PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  err = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(err);
  d_A = d_pv+(V->nc+V->l)*m;
  d_B = d_q+V->l*ldq+s;
  cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_C,m,n,k,&sone,d_A,m,d_B,ldq,&szero,d_work,m);CHKERRCUBLAS(cberr);
  for (j=0;j<n;j++) {
    err = cudaMemcpy(d_A+(s-V->l+j)*m,d_work+(j*m),m*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = MatDenseRestoreArray(Q,&q);CHKERRQ(ierr);
  err = cudaFree(d_q);CHKERRCUDA(err);
  err = cudaFree(d_work);CHKERRCUDA(err);
  ierr = VecCUDARestoreArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    C := A'*B
*/
PetscErrorCode BVDot_Svec_CUDA(BV X,BV Y,Mat M)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data,*y = (BV_SVEC*)Y->data;
  const PetscScalar *d_px,*d_py,*d_A,*d_B;
  PetscScalar       *pm,*d_work,sone=1.0,szero=0.0,*C,*CC;
  PetscInt          ldm,m,n,k,j;
  PetscMPIInt       len;
  cublasStatus_t    cberr;
  cudaError_t       err;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  m = Y->k-Y->l;
  n = X->k-X->l;
  k = X->n;
  ierr = MatGetSize(M,&ldm,NULL);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(y->v,&d_py);CHKERRQ(ierr);
  ierr = MatDenseGetArray(M,&pm);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  err = cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar));CHKERRCUDA(err);
  d_A = d_py+(Y->nc+Y->l)*Y->n;
  d_B = d_px+(X->nc+X->l)*X->n;
  C = pm+X->l*ldm+Y->l;
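  /* in the parallel case the local Gram block is computed on the GPU, copied
     back to the host and summed across processes with MPI_Allreduce; if the
     leading dimension of M differs from m, reduce into the contiguous buffer
     CC and scatter its columns into C afterwards */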
  if (x->mpi) {
    if (ldm==m) {
      ierr = BVAllocateWork_Private(X,m*n);CHKERRQ(ierr);
      if (k) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,ldm);CHKERRCUBLAS(cberr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        err = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
        ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        ierr = PetscArrayzero(X->work,m*n);CHKERRQ(ierr);
      }
      ierr = PetscMPIIntCast(m*n,&len);CHKERRQ(ierr);
      ierr = MPI_Allreduce(X->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
    } else {
      ierr = BVAllocateWork_Private(X,2*m*n);CHKERRQ(ierr);
      CC = X->work+m*n;
      if (k) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
        err = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
        ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        ierr = PetscArrayzero(X->work,m*n);CHKERRQ(ierr);
      }
      ierr = PetscMPIIntCast(m*n,&len);CHKERRQ(ierr);
      ierr = MPI_Allreduce(X->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
      for (j=0;j<n;j++) {
        ierr = PetscArraycpy(C+j*ldm,CC+j*m,m);CHKERRQ(ierr);
      }
    }
  } else {
    if (k) {
      ierr = BVAllocateWork_Private(X,m*n);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m);CHKERRCUBLAS(cberr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      err = cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      ierr = PetscLogGpuToCpu(m*n*sizeof(PetscScalar));CHKERRQ(ierr);
      for (j=0;j<n;j++) {
        ierr = PetscArraycpy(C+j*ldm,X->work+j*m,m);CHKERRQ(ierr);
      }
    }
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  err = cudaFree(d_work);CHKERRCUDA(err);
  ierr = MatDenseRestoreArray(M,&pm);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(y->v,&d_py);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*m*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#if defined(PETSC_USE_COMPLEX)
struct conjugate
{
  __host__ __device__
    PetscScalar operator()(PetscScalar x)
    {
      return PetscConj(x);
    }
};

PetscErrorCode ConjugateCudaArray(PetscScalar *a, PetscInt n)
{
  PetscErrorCode                  ierr;
  thrust::device_ptr<PetscScalar> ptr;

  PetscFunctionBegin;
  try {
    ptr = thrust::device_pointer_cast(a);
    thrust::transform(ptr,ptr+n,ptr,conjugate());
    ierr = WaitForGPU();CHKERRCUDA(ierr);
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscFunctionReturn(0);
}
#endif

/*
    y := A'*x computed as y' := x'*A
*/
PetscErrorCode BVDotVec_Svec_CUDA(BV X,Vec y,PetscScalar *q)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_work,szero=0.0,sone=1.0,*qq=q;
  PetscBLASInt      n,k,one=1;
  PetscMPIInt       len;
  Vec               z = y;
  cublasStatus_t    cberr;
  cudaError_t       err;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  n = X->n;
  k = X->k-X->l;
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  if (X->matrix) {
    ierr = BV_IPMatMult(X,y);CHKERRQ(ierr);
    z = X->Bx;
  }
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(z,&d_py);CHKERRQ(ierr);
  if (!q) {
    ierr = VecCUDAGetArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
  } else {
    err = cudaMalloc((void**)&d_work,k*sizeof(PetscScalar));CHKERRCUDA(err);
  }
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
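  /* y' := x'*A is computed as a single-row GEMM; in complex arithmetic cuBLAS
     is asked for the conjugate transpose of x, so the result must be
     conjugated afterwards with ConjugateCudaArray */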
  if (x->mpi) {
    ierr = BVAllocateWork_Private(X,k);CHKERRQ(ierr);
    if (n) {
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ierr = ConjugateCudaArray(d_work,k);CHKERRQ(ierr);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      err = cudaMemcpy(X->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      ierr = PetscArrayzero(X->work,k);CHKERRQ(ierr);
    }
    if (!q) {
      ierr = VecCUDARestoreArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
      ierr = VecGetArray(X->buffer,&qq);CHKERRQ(ierr);
    } else {
      err = cudaFree(d_work);CHKERRCUDA(err);
    }
    ierr = PetscMPIIntCast(k,&len);CHKERRQ(ierr);
    ierr = MPI_Allreduce(X->work,qq,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X));CHKERRQ(ierr);
    if (!q) { ierr = VecRestoreArray(X->buffer,&qq);CHKERRQ(ierr); }
  } else {
    if (n) {
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
      ierr = ConjugateCudaArray(d_work,k);CHKERRQ(ierr);
#else
      cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one);CHKERRCUBLAS(cberr);
#endif
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
    if (!q) {
      ierr = VecCUDARestoreArrayWrite(X->buffer,&d_work);CHKERRQ(ierr);
    } else {
      err = cudaMemcpy(q,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
      err = cudaFree(d_work);CHKERRCUDA(err);
    }
  }
  ierr = WaitForGPU();CHKERRCUDA(ierr);
  ierr = VecCUDARestoreArrayRead(z,&d_py);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    y := A'*x computed as y' := x'*A
*/
PetscErrorCode BVDotVec_Local_Svec_CUDA(BV X,Vec y,PetscScalar *m)
{
  PetscErrorCode    ierr;
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_y,szero=0.0,sone=1.0;
  PetscBLASInt      n,k,one=1;
  Vec               z = y;
  cublasStatus_t    cberr;
  cudaError_t       err;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  n = X->n;
  k = X->k-X->l;
  if (X->matrix) {
    ierr = BV_IPMatMult(X,y);CHKERRQ(ierr);
    z = X->Bx;
  }
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(z,&d_py);CHKERRQ(ierr);
  d_A = d_px+(X->nc+X->l)*X->n;
  d_x = d_py;
  if (n) {
    err = cudaMalloc((void**)&d_y,k*sizeof(PetscScalar));CHKERRCUDA(err);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if defined(PETSC_USE_COMPLEX)
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
    ierr = ConjugateCudaArray(d_y,k);CHKERRQ(ierr);
#else
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_y,one);CHKERRCUBLAS(cberr);
#endif
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    err = cudaMemcpy(m,d_y,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    ierr = PetscLogGpuToCpu(k*sizeof(PetscScalar));CHKERRQ(ierr);
    err = cudaFree(d_y);CHKERRCUDA(err);
  }
  ierr = VecCUDARestoreArrayRead(z,&d_py);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayRead(x->v,&d_px);CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*n*k);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
    Scale n scalars
*/
PetscErrorCode BVScale_Svec_CUDA(BV bv,PetscInt j,PetscScalar alpha)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_array,*d_A;
  PetscBLASInt   n,one=1;
  cublasStatus_t cberr;
  cudaError_t    err;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  ierr = VecCUDAGetArray(ctx->v,&d_array);CHKERRQ(ierr);
  if (j<0) {
    d_A = d_array+(bv->nc+bv->l)*bv->n;
    n = (bv->k-bv->l)*bv->n;
  } else {
    d_A = d_array+(bv->nc+j)*bv->n;
    n = bv->n;
  }
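  /* j<0 means scale all active columns at once, otherwise only column j;
     zeroing is done with cudaMemset, any other factor with cublasXscal */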
  if (alpha == (PetscScalar)0.0) {
    err = cudaMemset(d_A,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
  } else if (alpha != (PetscScalar)1.0) {
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    cberr = cublasXscal(cublasv2handle,n,&alpha,d_A,one);CHKERRCUBLAS(cberr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = PetscLogGpuFlops(1.0*n);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArray(ctx->v,&d_array);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVMatMult_Svec_CUDA(BV V,Mat A,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pw;
  PetscInt          j;

  PetscFunctionBegin;
  ierr = VecCUDAGetArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
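  /* apply A one column at a time, wrapping each column of V and W in the
     work vectors cv[1] with VecCUDAPlaceArray so that no data is copied */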
  for (j=0;j<V->k-V->l;j++) {
    ierr = VecCUDAPlaceArray(V->cv[1],(PetscScalar*)d_pv+(V->nc+V->l+j)*V->n);CHKERRQ(ierr);
    ierr = VecCUDAPlaceArray(W->cv[1],d_pw+(W->nc+W->l+j)*W->n);CHKERRQ(ierr);
    ierr = MatMult(A,V->cv[1],W->cv[1]);CHKERRQ(ierr);
    ierr = VecCUDAResetArray(V->cv[1]);CHKERRQ(ierr);
    ierr = VecCUDAResetArray(W->cv[1]);CHKERRQ(ierr);
  }
  ierr = VecCUDARestoreArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVCopy_Svec_CUDA(BV V,BV W)
{
  PetscErrorCode    ierr;
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv,*d_pvc;
  PetscScalar       *d_pw,*d_pwc;
  cudaError_t       err;

  PetscFunctionBegin;
  ierr = VecCUDAGetArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  d_pvc = d_pv+(V->nc+V->l)*V->n;
  d_pwc = d_pw+(W->nc+W->l)*W->n;
  err = cudaMemcpy(d_pwc,d_pvc,(V->k-V->l)*V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  ierr = VecCUDARestoreArrayRead(v->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(w->v,&d_pw);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVCopyColumn_Svec_CUDA(BV V,PetscInt j,PetscInt i)
{
  PetscErrorCode ierr;
  BV_SVEC        *v = (BV_SVEC*)V->data;
  PetscScalar    *d_pv;
  cudaError_t    err;

  PetscFunctionBegin;
  ierr = VecCUDAGetArray(v->v,&d_pv);CHKERRQ(ierr);
  err = cudaMemcpy(d_pv+(V->nc+i)*V->n,d_pv+(V->nc+j)*V->n,V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
  ierr = VecCUDARestoreArray(v->v,&d_pv);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVResize_Svec_CUDA(BV bv,PetscInt m,PetscBool copy)
{
  PetscErrorCode    ierr;
  BV_SVEC           *ctx = (BV_SVEC*)bv->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pnew,*d_ptr;
  PetscInt          bs,lsplit;
  Vec               vnew,vpar;
  char              str[50];
  cudaError_t       err;
  BV                parent;

  PetscFunctionBegin;
  if (bv->issplit==2) {
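    /* right half of a split BV: re-anchor the placed array at the parent's
       offset instead of allocating new storage */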
    parent = bv->splitparent;
    lsplit = parent->lsplit;
    vpar = ((BV_SVEC*)parent->data)->v;
    ierr = VecCUDAResetArray(ctx->v);CHKERRQ(ierr);
    ierr = VecCUDAGetArray(vpar,&d_ptr);CHKERRQ(ierr);
    ierr = VecCUDAPlaceArray(ctx->v,d_ptr+lsplit*bv->n);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(vpar,&d_ptr);CHKERRQ(ierr);
  } else if (!bv->issplit) {
    ierr = VecGetBlockSize(bv->t,&bs);CHKERRQ(ierr);
    ierr = VecCreate(PetscObjectComm((PetscObject)bv->t),&vnew);CHKERRQ(ierr);
    ierr = VecSetType(vnew,((PetscObject)bv->t)->type_name);CHKERRQ(ierr);
    ierr = VecSetSizes(vnew,m*bv->n,PETSC_DECIDE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(vnew,bs);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)bv,(PetscObject)vnew);CHKERRQ(ierr);
    if (((PetscObject)bv)->name) {
      ierr = PetscSNPrintf(str,50,"%s_0",((PetscObject)bv)->name);CHKERRQ(ierr);
      ierr = PetscObjectSetName((PetscObject)vnew,str);CHKERRQ(ierr);
    }
    if (copy) {
      ierr = VecCUDAGetArrayRead(ctx->v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDAGetArrayWrite(vnew,&d_pnew);CHKERRQ(ierr);
      err = cudaMemcpy(d_pnew,d_pv,PetscMin(m,bv->m)*bv->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
      ierr = VecCUDARestoreArrayRead(ctx->v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDARestoreArrayWrite(vnew,&d_pnew);CHKERRQ(ierr);
    }
    ierr = VecDestroy(&ctx->v);CHKERRQ(ierr);
    ctx->v = vnew;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode BVGetColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_pv;
  PetscInt       l;

  PetscFunctionBegin;
  l = BVAvailableVec;
  ierr = VecCUDAGetArray(ctx->v,&d_pv);CHKERRQ(ierr);
  ierr = VecCUDAPlaceArray(bv->cv[l],d_pv+(bv->nc+j)*bv->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVRestoreColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  PetscErrorCode ierr;
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscInt       l;

  PetscFunctionBegin;
  l = (j==bv->ci[0])? 0: 1;
  ierr = VecCUDAResetArray(bv->cv[l]);CHKERRQ(ierr);
  ierr = VecCUDARestoreArray(ctx->v,NULL);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode BVRestoreSplit_Svec_CUDA(BV bv,BV *L,BV *R)
{
  PetscErrorCode    ierr;
  Vec               v;
  const PetscScalar *d_pv;
  PetscObjectState  lstate,rstate;
  PetscBool         change=PETSC_FALSE;

  PetscFunctionBegin;
  /* force sync flag to PETSC_CUDA_BOTH */
  if (L) {
    ierr = PetscObjectStateGet((PetscObject)*L,&lstate);CHKERRQ(ierr);
    if (lstate != bv->lstate) {
      v = ((BV_SVEC*)bv->L->data)->v;
      ierr = VecCUDAGetArrayRead(v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDARestoreArrayRead(v,&d_pv);CHKERRQ(ierr);
      change = PETSC_TRUE;
    }
  }
  if (R) {
    ierr = PetscObjectStateGet((PetscObject)*R,&rstate);CHKERRQ(ierr);
    if (rstate != bv->rstate) {
      v = ((BV_SVEC*)bv->R->data)->v;
      ierr = VecCUDAGetArrayRead(v,&d_pv);CHKERRQ(ierr);
      ierr = VecCUDARestoreArrayRead(v,&d_pv);CHKERRQ(ierr);
      change = PETSC_TRUE;
    }
  }
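  /* if either half was modified, touch the parent's array too so that its
     sync flag is likewise forced */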
  if (change) {
    v = ((BV_SVEC*)bv->data)->v;
    ierr = VecCUDAGetArray(v,(PetscScalar**)&d_pv);CHKERRQ(ierr);
    ierr = VecCUDARestoreArray(v,(PetscScalar**)&d_pv);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}