Actual source code: bvorthogcuda.cu
slepc-3.12.1 2019-11-08
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-2019, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: BV orthogonalization routines (CUDA)
12: */
14: #include <slepc/private/bvimpl.h> /*I "slepcbv.h" I*/
15: #include <slepcblaslapack.h>
16: #include <petsccublas.h>
18: /*
19: BV_CleanCoefficients_CUDA - Sets to zero all entries of column j of the bv buffer
20: */
21: PetscErrorCode BV_CleanCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h)
22: {
24: PetscScalar *d_hh,*d_a;
25: PetscInt i;
28: if (!h) {
29: VecCUDAGetArray(bv->buffer,&d_a);
30: d_hh = d_a + j*(bv->nc+bv->m);
31: cudaMemset(d_hh,0,(bv->nc+j)*sizeof(PetscScalar));
32: WaitForGPU();CHKERRCUDA(ierr);
33: VecCUDARestoreArray(bv->buffer,&d_a);
34: } else { /* cpu memory */
35: for (i=0;i<bv->nc+j;i++) h[i] = 0.0;
36: }
37: return(0);
38: }
40: /*
41: BV_AddCoefficients_CUDA - Add the contents of the scratch (0-th column) of the bv buffer
42: into column j of the bv buffer
43: */
44: PetscErrorCode BV_AddCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *c)
45: {
47: PetscScalar *d_h,*d_c,sone=1.0;
48: PetscInt i;
49: PetscBLASInt one=1;
50: cublasStatus_t cberr;
51: cublasHandle_t cublasv2handle;
54: if (!h) {
55: PetscCUBLASGetHandle(&cublasv2handle);
56: VecCUDAGetArray(bv->buffer,&d_c);
57: d_h = d_c + j*(bv->nc+bv->m);
58: PetscLogGpuTimeBegin();
59: cberr = cublasXaxpy(cublasv2handle,bv->nc+j,&sone,d_c,one,d_h,one);CHKERRCUBLAS(cberr);
60: PetscLogGpuTimeEnd();
61: PetscLogGpuFlops(1.0*bv->nc+j);
62: WaitForGPU();CHKERRCUDA(ierr);
63: VecCUDARestoreArray(bv->buffer,&d_c);
64: } else { /* cpu memory */
65: for (i=0;i<bv->nc+j;i++) h[i] += c[i];
66: PetscLogFlops(1.0*bv->nc+j);
67: }
68: return(0);
69: }
71: /*
72: BV_SetValue_CUDA - Sets value in row j (counted after the constraints) of column k
73: of the coefficients array
74: */
75: PetscErrorCode BV_SetValue_CUDA(BV bv,PetscInt j,PetscInt k,PetscScalar *h,PetscScalar value)
76: {
78: PetscScalar *d_h,*a;
79: cudaError_t cerr;
82: if (!h) {
83: VecCUDAGetArray(bv->buffer,&a);
84: d_h = a + k*(bv->nc+bv->m) + bv->nc+j;
85: cerr = cudaMemcpy(d_h,&value,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
86: PetscLogCpuToGpu(sizeof(PetscScalar));
87: WaitForGPU();CHKERRCUDA(ierr);
88: VecCUDARestoreArray(bv->buffer,&a);
89: } else { /* cpu memory */
90: h[bv->nc+j] = value;
91: }
92: return(0);
93: }
95: /*
96: BV_SquareSum_CUDA - Returns the value h'*h, where h represents the contents of the
97: coefficients array (up to position j)
98: */
99: PetscErrorCode BV_SquareSum_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *sum)
100: {
101: PetscErrorCode ierr;
102: const PetscScalar *d_h;
103: PetscScalar dot;
104: PetscInt i;
105: PetscBLASInt one=1;
106: cublasStatus_t cberr;
107: cublasHandle_t cublasv2handle;
110: if (!h) {
111: PetscCUBLASGetHandle(&cublasv2handle);
112: VecCUDAGetArrayRead(bv->buffer,&d_h);
113: PetscLogGpuTimeBegin();
114: cberr = cublasXdotc(cublasv2handle,bv->nc+j,d_h,one,d_h,one,&dot);CHKERRCUBLAS(cberr);
115: PetscLogGpuTimeEnd();
116: PetscLogGpuFlops(2.0*bv->nc+j);
117: WaitForGPU();CHKERRCUDA(ierr);
118: *sum = PetscRealPart(dot);
119: VecCUDARestoreArrayRead(bv->buffer,&d_h);
120: } else { /* cpu memory */
121: *sum = 0.0;
122: for (i=0;i<bv->nc+j;i++) *sum += PetscRealPart(h[i]*PetscConj(h[i]));
123: PetscLogFlops(2.0*bv->nc+j);
124: }
125: return(0);
126: }
128: #define X_AXIS 0
129: #define BLOCK_SIZE_X 64
130: #define TILE_SIZE_X 16 /* work to be done by any thread on axis x */
132: /*
133: Set the kernels grid dimensions
134: xcount: number of kernel calls needed for the requested size
135: */
136: PetscErrorCode SetGrid1D(PetscInt n, dim3 *dimGrid, dim3 *dimBlock,PetscInt *xcount)
137: {
138: PetscInt one=1,card;
139: struct cudaDeviceProp devprop;
140: cudaError_t cerr;
143: *xcount = 1;
144: if (n>BLOCK_SIZE_X) {
145: dimBlock->x = BLOCK_SIZE_X;
146: dimGrid->x = (n+BLOCK_SIZE_X*TILE_SIZE_X-one)/BLOCK_SIZE_X*TILE_SIZE_X;
147: } else {
148: dimBlock->x = (n+TILE_SIZE_X-one)/TILE_SIZE_X;
149: dimGrid->x = one;
150: }
151: cerr = cudaGetDevice(&card);CHKERRCUDA(cerr);
152: cerr = cudaGetDeviceProperties(&devprop,card);CHKERRCUDA(cerr);
153: if (dimGrid->x>(unsigned)devprop.maxGridSize[X_AXIS]) {
154: *xcount = (dimGrid->x+devprop.maxGridSize[X_AXIS]-one)/devprop.maxGridSize[X_AXIS];
155: dimGrid->x = devprop.maxGridSize[X_AXIS];
156: }
157: return(0);
158: }
160: /* pointwise multiplication */
161: __global__ void PointwiseMult_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
162: {
163: PetscInt i,x;
165: x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
166: for (i=x;i<x+TILE_SIZE_X&&i<n;i++) {
167: a[i] *= PetscRealPart(b[i]);
168: }
169: }
171: /* pointwise division */
172: __global__ void PointwiseDiv_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
173: {
174: PetscInt i,x;
176: x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x*TILE_SIZE_X+threadIdx.x*TILE_SIZE_X;
177: for (i=x;i<x+TILE_SIZE_X&&i<n;i++) {
178: a[i] /= PetscRealPart(b[i]);
179: }
180: }
182: /*
183: BV_ApplySignature_CUDA - Computes the pointwise product h*omega, where h represents
184: the contents of the coefficients array (up to position j) and omega is the signature;
185: if inverse=TRUE then the operation is h/omega
186: */
187: PetscErrorCode BV_ApplySignature_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscBool inverse)
188: {
189: PetscErrorCode ierr;
190: PetscScalar *d_h;
191: const PetscScalar *d_omega,*omega;
192: PetscInt i,xcount;
193: dim3 blocks3d, threads3d;
194: cudaError_t cerr;
197: if (!(bv->nc+j)) return(0);
198: if (!h) {
199: VecCUDAGetArray(bv->buffer,&d_h);
200: VecCUDAGetArrayRead(bv->omega,&d_omega);
201: SetGrid1D(bv->nc+j,&blocks3d,&threads3d,&xcount);
202: PetscLogGpuTimeBegin();
203: if (inverse) {
204: for (i=0;i<xcount;i++) {
205: PointwiseDiv_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
206: }
207: } else {
208: for (i=0;i<xcount;i++) {
209: PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
210: }
211: }
212: cerr = cudaGetLastError();CHKERRCUDA(cerr);
213: PetscLogGpuTimeEnd();
214: PetscLogGpuFlops(1.0*bv->nc+j);
215: WaitForGPU();CHKERRCUDA(ierr);
216: VecCUDARestoreArrayRead(bv->omega,&d_omega);
217: VecCUDARestoreArray(bv->buffer,&d_h);
218: } else {
219: VecGetArrayRead(bv->omega,&omega);
220: if (inverse) for (i=0;i<bv->nc+j;i++) h[i] /= PetscRealPart(omega[i]);
221: else for (i=0;i<bv->nc+j;i++) h[i] *= PetscRealPart(omega[i]);
222: VecRestoreArrayRead(bv->omega,&omega);
223: PetscLogFlops(1.0*bv->nc+j);
224: }
225: return(0);
226: }
228: /*
229: BV_SquareRoot_CUDA - Returns the square root of position j (counted after the constraints)
230: of the coefficients array
231: */
232: PetscErrorCode BV_SquareRoot_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *beta)
233: {
234: PetscErrorCode ierr;
235: const PetscScalar *d_h;
236: PetscScalar hh;
237: cudaError_t cerr;
240: if (!h) {
241: VecCUDAGetArrayRead(bv->buffer,&d_h);
242: cerr = cudaMemcpy(&hh,d_h+bv->nc+j,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
243: PetscLogGpuToCpu(sizeof(PetscScalar));
244: WaitForGPU();CHKERRCUDA(ierr);
245: BV_SafeSqrt(bv,hh,beta);
246: VecCUDARestoreArrayRead(bv->buffer,&d_h);
247: } else {
248: BV_SafeSqrt(bv,h[bv->nc+j],beta);
249: }
250: return(0);
251: }
253: /*
254: BV_StoreCoefficients_CUDA - Copy the contents of the coefficients array to an array dest
255: provided by the caller (only values from l to j are copied)
256: */
257: PetscErrorCode BV_StoreCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *dest)
258: {
259: PetscErrorCode ierr;
260: const PetscScalar *d_h,*d_a;
261: PetscInt i;
262: cudaError_t cerr;
265: if (!h) {
266: VecCUDAGetArrayRead(bv->buffer,&d_a);
267: d_h = d_a + j*(bv->nc+bv->m)+bv->nc;
268: cerr = cudaMemcpy(dest-bv->l,d_h,(j-bv->l)*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
269: PetscLogGpuToCpu((j-bv->l)*sizeof(PetscScalar));
270: WaitForGPU();CHKERRCUDA(ierr);
271: VecCUDARestoreArrayRead(bv->buffer,&d_a);
272: } else {
273: for (i=bv->l;i<j;i++) dest[i-bv->l] = h[bv->nc+i];
274: }
275: return(0);
276: }