Tricks and Tips: cudaMalloc3D and cudaMemcpy3D

When accessing 3D arrays in CUDA, memory transactions are much faster if the data elements are properly aligned.
CUDA provides the cudaMalloc3D function to “pad” the rows of 3D matrices with extra bytes so as to achieve the desired alignment.
Refer to the “CUDA C Programming Guide“, Sections 3.2.2 and 5.3.2, for more information.
This function also returns the allocation pitch, which can subsequently be used by cudaMemcpy3D to properly access the data.

Below, a full code is reported showing how to allocate 3D memory by cudaMalloc3D, moving a host allocated 1D memory to 3D device memory by cudaMemcpy3D, performing some operations on the 3D device data by the test_kernel_3D __global__ function and moving the 3D result data back to 1D host memory, again by cudaMemcpy3D.
The __global__ function test_kernel_3D squares each element of the 3D device memory. In particular, each thread of a 2D grid takes care of performing a for loop along the “depth” dimension.

#include<stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<conio.h>

#define BLOCKSIZE_x 16
#define BLOCKSIZE_y 16

#define N 128
#define M 64
#define W 16



/*****************/
/* CUDA MEMCHECK */
/*****************/

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Checks a CUDA runtime return code; on failure prints the error string with
// the originating file/line and (optionally) waits for a keypress and exits.
// `file` is const because __FILE__ expands to a string literal.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      // "\n" restored: the original "%dn" printed a literal 'n' instead of a newline.
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);

      if (abort) { getch(); exit(code); }
   }
}



/*******************/
/* iDivUp FUNCTION */
/*******************/

// Ceiling integer division: number of blocks of size `b` needed to cover `a`.
int iDivUp(int a, int b)
{
    int quotient = a / b;
    if (a % b != 0)
        quotient += 1;
    return quotient;
}


/******************/
/* TEST KERNEL 3D */
/******************/

// Squares every element of a pitched 3D device allocation in place.
// Expects a 2D grid: x covers the row (width) dimension, y the column
// (height) dimension; each thread loops over the W slices in depth.
// devPitchedPtr must come from cudaMalloc3D, so .pitch is the padded row
// stride in bytes, .xsize the logical row width in bytes, and .ysize the
// number of rows per slice.
__global__ void test_kernel_3D(cudaPitchedPtr devPitchedPtr)
{
   int tidx = blockIdx.x*blockDim.x+threadIdx.x;
   int tidy = blockIdx.y*blockDim.y+threadIdx.y;

   // Bounds guard: iDivUp-sized grids overshoot whenever the extent is not
   // a multiple of the block size, so out-of-range threads must bail out.
   if ((size_t)tidx * sizeof(float) >= devPitchedPtr.xsize ||
       (size_t)tidy >= devPitchedPtr.ysize) return;

   char* devPtr = (char*) devPitchedPtr.ptr;
   size_t pitch = devPitchedPtr.pitch;
   // Bytes per 2D slice; ysize equals the extent height, so the kernel no
   // longer depends on the file-scope N macro matching the allocation.
   size_t slicePitch = pitch * devPitchedPtr.ysize;

   for (int w = 0; w < W; w++)
   {
      char* slice = devPtr + w * slicePitch;
      float* row = (float*)(slice + tidy * pitch);
      row[tidx] = row[tidx] * row[tidx];
   }
}


/********/
/* MAIN */
/********/

// Allocates a W x N x M float volume on the host, copies it to a pitched 3D
// device allocation, squares every element on the GPU, copies the result
// back, and prints it.
int main()
{
   // Host array laid out to match the cudaMemcpy3D description below:
   // x (fastest-varying) = M columns, y = N rows, z (slowest) = W slices.
   // The original a[N][M][W] made the depth index fastest, which disagreed
   // with the extent {M*sizeof(float), N, W} used for the copies (it only
   // "worked" because squaring is elementwise).
   // `static` keeps the 512 KB buffer off the stack.
   static float a[W][N][M];

   for (int w=0; w<W; w++)
      for (int i=0; i<N; i++)
         for (int j=0; j<M; j++)
            a[w][i][j] = 3.f;

   // --- 3D pitched allocation and host->device memcopy
   // Extent width is expressed in BYTES for non-array allocations.
   cudaExtent extent = make_cudaExtent(M * sizeof(float), N, W);

   cudaPitchedPtr devPitchedPtr;

   gpuErrchk(cudaMalloc3D(&devPitchedPtr, extent));

   cudaMemcpy3DParms p = { 0 };

   // Host side is tightly packed: pitch = row width in bytes.
   p.srcPtr.ptr = a;
   p.srcPtr.pitch = M * sizeof(float);
   p.srcPtr.xsize = M;
   p.srcPtr.ysize = N;
   // Destination descriptor comes straight from cudaMalloc3D, carrying the
   // padded device pitch.
   p.dstPtr = devPitchedPtr;
   p.extent = extent;
   p.kind = cudaMemcpyHostToDevice;

   gpuErrchk(cudaMemcpy3D(&p));

   dim3 GridSize(iDivUp(M,BLOCKSIZE_x),iDivUp(N,BLOCKSIZE_y));

   // Fixed: arguments were swapped (BLOCKSIZE_y first) in the original —
   // harmless only because both are 16.
   dim3 BlockSize(BLOCKSIZE_x,BLOCKSIZE_y);

   test_kernel_3D<<<GridSize,BlockSize>>>(devPitchedPtr);

   gpuErrchk(cudaPeekAtLastError());     // launch-configuration errors
   gpuErrchk(cudaDeviceSynchronize());   // asynchronous execution errors

   // --- device->host memcopy: reuse p, swapping source and destination.
   p.srcPtr = devPitchedPtr;
   p.dstPtr.ptr = a;
   p.dstPtr.pitch = M * sizeof(float);
   p.dstPtr.xsize = M;
   p.dstPtr.ysize = N;
   p.kind = cudaMemcpyDeviceToHost;

   gpuErrchk(cudaMemcpy3D(&p));

   // "\n" restored: the original "%fn" printed a literal 'n'.
   for (int i=0; i<N; i++)
      for (int j=0; j<M; j++)
         for (int w=0; w<W; w++)
            printf("row %i column %i depth %i value %f\n",i,j,w,a[w][i][j]);

   // Release the pitched device allocation (was leaked in the original).
   gpuErrchk(cudaFree(devPitchedPtr.ptr));

   getch();
   return 0;
}

Leave a Reply

Your email address will not be published. Required fields are marked *