Batched FFTs by cufftPlanMany

The cuFFT library enables the possibility of performing batched FFTs, that is, execute a number of FFTs with one only cufftExec call. In this case, the cuFFT plan must be set by cufftPlanMany.
One apparently annoying step is correctly setting the parameters of cufftPlanMany.
A description of the parameters of cufftPlanMany is contained in the cuFFT guide; reading Section 2.6 “Advanced Data Layout” is highly recommended.
Below is a full code showing how to set them to perform both the direct and inverse FFTs of a number of batch 2D matrices.
Note that the final result of the inverse FFT coincides with the input sequence but for a multiplicative constant representing the overall number of matrix elements.

#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include <assert.h>

/********************/
/* CUDA ERROR CHECK */
/********************/

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %dn", cudaGetErrorString(code), file, line);
      if (abort) { getchar(); exit(code); }
   }
}

/*********************/
/* CUFFT ERROR CHECK */
/*********************/

static const char *_cudaGetErrorEnum(cufftResult error)
{

   switch (error)
   {

      case CUFFT_SUCCESS:
      return "CUFFT_SUCCESS";

      case CUFFT_INVALID_PLAN:
      return "CUFFT_INVALID_PLAN";

      case CUFFT_ALLOC_FAILED:
      return "CUFFT_ALLOC_FAILED";

      case CUFFT_INVALID_TYPE:
      return "CUFFT_INVALID_TYPE";

      case CUFFT_INVALID_VALUE:
      return "CUFFT_INVALID_VALUE";

      case CUFFT_INTERNAL_ERROR:
      return "CUFFT_INTERNAL_ERROR";

      case CUFFT_EXEC_FAILED:
      return "CUFFT_EXEC_FAILED";

      case CUFFT_SETUP_FAILED:
      return "CUFFT_SETUP_FAILED";

      case CUFFT_INVALID_SIZE:
      return "CUFFT_INVALID_SIZE";

      case CUFFT_UNALIGNED_DATA:
      return "CUFFT_UNALIGNED_DATA";
   }

   return "<unknown>";
}

#define cufftSafeCall(err)      __cufftSafeCall(err, __FILE__, __LINE__)

inline void __cufftSafeCall(cufftResult err, const char *file, const int line)
{

   if( CUFFT_SUCCESS != err) {
      fprintf(stderr, "CUFFT error in file '%s', line %dn %snerror %d: %snterminating!n",__FILE__, __LINE__,err, 
      _cudaGetErrorEnum(err)); 
      cudaDeviceReset(); assert(0); 
   }
}

/********/
/* MAIN */
/********/

void main() {
   cufftHandle forward_plan, inverse_plan;

   int batch = 3;
   int rank = 2;
   int nRows = 6;
   int nCols = 6;
   int n[2] = {nRows, nCols};

   int idist = nRows*nCols;
   int odist = nRows*(nCols/2+1);

   int inembed[] = {nRows, nCols};
   int onembed[] = {nRows, nCols/2+1};

   int istride = 1;
   int ostride = 1;

   cufftSafeCall(cufftPlanMany(&forward_plan,      rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_R2C, batch));

   float *h_in = (float*)malloc(sizeof(float)*nRows*nCols*batch);

   for(int i=0; i<nRows*nCols*batch; i++) h_in[i] = 1.f;

   float2* h_freq = (float2*)malloc(sizeof(float2)*nRows*(nCols/2+1)*batch);

   float* d_in;              
   gpuErrchk(cudaMalloc(&d_in, sizeof(float)*nRows*nCols*batch));

   float2* d_freq;     
   gpuErrchk(cudaMalloc(&d_freq, sizeof(float2)*nRows*(nCols/2+1)*batch));

   gpuErrchk(cudaMemcpy(d_in,h_in,sizeof(float)*nRows*nCols*batch,cudaMemcpyHostToDevice));
   cufftSafeCall(cufftExecR2C(forward_plan, d_in, d_freq));

   gpuErrchk(cudaMemcpy(h_freq,d_freq,sizeof(float2)*nRows*(nCols/2+1)*batch,cudaMemcpyDeviceToHost));

   for(int i=0; i<nRows*(nCols/2+1)*batch; i++) printf("Direct transform: %i %f %fn",i,h_freq[i].x,h_freq[i].y);

   cufftSafeCall(cufftPlanMany(&inverse_plan, rank, n, onembed, ostride, odist, inembed, istride, idist, CUFFT_C2R, batch));

   cufftSafeCall(cufftExecC2R(inverse_plan, d_freq, d_in));

   gpuErrchk(cudaMemcpy(h_in,d_in,sizeof(float)*nRows*nCols*batch,cudaMemcpyDeviceToHost));

   for(int i=0; i<nRows*nCols*batch; i++) printf("Inverse transform: %i %f n",i,h_in[i]);

   getchar();

}

One thought on “Batched FFTs by cufftPlanMany

  1. Julia says:

    Hi, is it possible to do a batched 2D fft of 1D fft without a loop?

    https://stackoverflow.com/questions/26918101/1d-ffts-of-columns-and-rows-of-a-3d-matrix-in-cuda

    This is the example I am thinking of. Trying to do ID fft along x, then along y for all of z. Is there a way to accomplish along x without the loop through all of z?

Leave a Reply

Your email address will not be published. Required fields are marked *