Tricks and Tips: cudaMallocPitch and cudaMemcpy2D

When accessing 2D arrays in CUDA, memory transactions are much faster if each row is properly aligned.
CUDA provides the cudaMallocPitch function to “pad” 2D matrix rows with extra bytes so as to achieve the desired alignment. Please refer to the “CUDA C Programming Guide”, Sections 3.2.2 and 5.3.2, for more information.
The function also returns the allocation pitch, which can subsequently be used by cudaMemcpy2D to properly access the data.

Below is a simple usage example:


#include<stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<conio.h>
 
// Thread-block dimensions for the 2D kernel launch (16 x 16 = 256 threads/block).
#define BLOCKSIZE_x 16
#define BLOCKSIZE_y 16
 
// Matrix dimensions: N rows by M columns of float.
#define N 256
#define M 256
 

/*****************/
/* CUDA MEMCHECK */
/*****************/

// Wraps a CUDA runtime call; reports and (optionally) aborts on failure.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Prints a human-readable CUDA error together with its source location.
// `file` is const-qualified: __FILE__ expands to a string literal, which
// must not bind to a non-const char* in C++11 and later.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      // Fix: the format string was missing the '\n' escape ("%dn"),
      // gluing a stray 'n' onto the line number instead of a newline.
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      // getch() keeps the console window open (Windows / conio.h) before exiting.
      if (abort) { getch(); exit(code); }
   }
}

 

/*******************/
/* iDivUp FUNCTION */
/*******************/

// Integer ceiling division: the smallest number of size-b chunks that
// cover a items. Used to compute grid dimensions from data extents.
int iDivUp(int a, int b)
{
    int quotient  = a / b;
    int remainder = a % b;
    return (remainder != 0) ? quotient + 1 : quotient;
}



/******************/
/* TEST KERNEL 2D */
/******************/

// Squares every element of a pitched N x M device matrix in place.
// Expected launch: 2D grid whose x extent covers the M columns and whose
// y extent covers the N rows. `pitch` is the row stride in BYTES as
// returned by cudaMallocPitch.
__global__ void test_kernel_2D(float* d_a, size_t pitch)
{
   int tidx = blockIdx.x*blockDim.x+threadIdx.x;   // column index (0..M-1)
   int tidy = blockIdx.y*blockDim.y+threadIdx.y;   // row index    (0..N-1)

   // Fix: the row byte-offset must use the ROW index (tidy), not the
   // column index (tidx). The original `tidx*pitch` only worked because
   // N == M; it also made consecutive threads of a warp stride by `pitch`
   // bytes. With tidx addressing consecutive floats within one row,
   // warp accesses are contiguous and coalesce into few transactions.
   if ((tidx<M) && (tidy<N))
   {
      float* row_a = (float*)((char*)d_a + tidy*pitch);
      row_a[tidx] = row_a[tidx] * row_a[tidx];
   }
}

 
/********/
/* MAIN */
/********/

// Fills an N x M host matrix, copies it into a pitched device allocation,
// squares it element-wise with a 2D kernel, copies it back, and prints it.
int main()
{
   float a[N][M];     // host matrix (~256 KB static; fits common stack limits)
   float *d_a;        // device pitched allocation
   size_t pitch;      // row stride in bytes chosen by cudaMallocPitch

   for (int i=0; i<N; i++)
      for (int j=0; j<M; j++) {
         a[i][j] = 3.f;
         // Fix: the format string had lost its '\n' escape ("%f n").
         printf("row %i column %i value %f \n",i,j,a[i][j]);
      }

// --- 2D pitched allocation and host->device memcopy

   // Fix: cudaMallocPitch takes a void**; the explicit cast is required
   // in C++ (a float** does not implicitly convert to void**).
   gpuErrchk(cudaMallocPitch((void**)&d_a,&pitch,M*sizeof(float),N));

   gpuErrchk(cudaMemcpy2D(d_a,pitch,a,M*sizeof(float),M*sizeof(float),N,cudaMemcpyHostToDevice));

   // Grid x covers the M columns, y covers the N rows.
   dim3 GridSize1(iDivUp(M,BLOCKSIZE_x),iDivUp(N,BLOCKSIZE_y));

   // Fix: block dims were swapped as (BLOCKSIZE_y, BLOCKSIZE_x); harmless
   // only because both are 16. Order must match the grid: (x, y).
   dim3 BlockSize1(BLOCKSIZE_x,BLOCKSIZE_y);

   test_kernel_2D<<<GridSize1,BlockSize1>>>(d_a,pitch);

   gpuErrchk(cudaPeekAtLastError());     // launch-configuration errors
   gpuErrchk(cudaDeviceSynchronize());   // in-kernel execution errors

   gpuErrchk(cudaMemcpy2D(a,M*sizeof(float),d_a,pitch,M*sizeof(float),N,cudaMemcpyDeviceToHost));

   // Fix: "%fn" was missing the newline escape.
   for (int i=0; i<N; i++) for (int j=0; j<M; j++) printf("row %i column %i value %f\n",i,j,a[i][j]);

   // Fix: the original leaked the device allocation.
   gpuErrchk(cudaFree(d_a));

   getch();   // keep the console window open (Windows / conio.h)

   return 0;
}

5 thoughts on “Tricks and Tips: cudaMallocPitch and cudaMemcpy2D

  1. Maksudul Alam says:

    Great Article, helped me understanding using the pitch for rectangular grid. Thanks!

  2. Nayan Yengul says:

    Hello,
    This is a very useful article. But I have one doubt: is it necessary to allocate the 2D array on the host statically? Can we allocate our host 2D array dynamically and then copy it to the device?
    Please suggest me solution.

    1. OrangeOwl says:

      Yes, the code can be employed either when the host array is statically or dynamically allocated.

    2. Liangchao Shang says:

      Sure, you can allocate a host 2D array dynamically and copy to/from the device.

  3. Corey D says:

    One question:

    in the kernel function, should
    (tidx<M) && (tidy<N)
    be
    (tidx<N) && (tidy<M) ?

Leave a Reply

Your email address will not be published. Required fields are marked *