Tricks and Tips: 1D batched FFTs of real arrays

As reported in the cuFFT User Guide – CUDA 6.5, batch sizes other than 1 for cufftPlan1d() have been deprecated. Use cufftPlanMany() for multiple batch execution.

Below, a fully worked example using cufftPlanMany() instead of cufftPlan1d() is reported. As it can be seen,

int rank = 1; // --- 1D FFTs
int n[] = { DATASIZE }; // --- Size of the Fourier transform
int istride = 1, ostride = 1; // --- Distance between two successive input/output elements
int idist = DATASIZE, odist = (DATASIZE / 2 + 1); // --- Distance between batches
int inembed[] = { 0 }; // --- Input size with pitch (ignored for 1D transforms)
int onembed[] = { 0 }; // --- Output size with pitch (ignored for 1D transforms)
int batch = BATCH; // --- Number of batched executions

cufftPlanMany(&handle, rank, n,
inembed, istride, idist,
onembed, ostride, odist, CUFFT_R2C, batch);

is totally equivalent to the “old fashioned”:

cufftPlan1d(&handle, DATASIZE, CUFFT_R2C, BATCH);

Here is the full example:

#include <cuda.h>
#include <cufft.h>
#include <stdio.h>
#include <math.h>

#define DATASIZE 8
#define BATCH 2

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %dn", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

/********/
/* MAIN */
/********/
int main ()
{
    // --- Host side input data allocation and initialization
    cufftReal *hostInputData = (cufftReal*)malloc(DATASIZE*BATCH*sizeof(cufftReal));
    for (int i=0; i<BATCH; i++)
        for (int j=0; j<DATASIZE; j++) hostInputData[i*DATASIZE + j] = (cufftReal)(i + 1);

    // --- Device side input data allocation and initialization
    cufftReal *deviceInputData; gpuErrchk(cudaMalloc((void**)&deviceInputData, DATASIZE * BATCH * sizeof(cufftReal)));
    cudaMemcpy(deviceInputData, hostInputData, DATASIZE * BATCH * sizeof(cufftReal), cudaMemcpyHostToDevice);

    // --- Host side output data allocation
    cufftComplex *hostOutputData = (cufftComplex*)malloc((DATASIZE / 2 + 1) * BATCH * sizeof(cufftComplex));

    // --- Device side output data allocation
    cufftComplex *deviceOutputData; gpuErrchk(cudaMalloc((void**)&deviceOutputData, (DATASIZE / 2 + 1) * BATCH * sizeof(cufftComplex)));

    // --- Batched 1D FFTs
    cufftHandle handle;
    int rank = 1;                           // --- 1D FFTs
    int n[] = { DATASIZE };                 // --- Size of the Fourier transform
    int istride = 1, ostride = 1;           // --- Distance between two successive input/output elements
    int idist = DATASIZE, odist = (DATASIZE / 2 + 1); // --- Distance between batches
    int inembed[] = { 0 };                  // --- Input size with pitch (ignored for 1D transforms)
    int onembed[] = { 0 };                  // --- Output size with pitch (ignored for 1D transforms)
    int batch = BATCH;                      // --- Number of batched executions
    cufftPlanMany(&handle, rank, n,
                  inembed, istride, idist,
                  onembed, ostride, odist, CUFFT_R2C, batch);

    //cufftPlan1d(&handle, DATASIZE, CUFFT_R2C, BATCH);
    cufftExecR2C(handle,  deviceInputData, deviceOutputData);

    // --- Device->Host copy of the results
    gpuErrchk(cudaMemcpy(hostOutputData, deviceOutputData, (DATASIZE / 2 + 1) * BATCH * sizeof(cufftComplex), cudaMemcpyDeviceToHost));

    for (int i=0; i<BATCH; i++)
        for (int j=0; j<(DATASIZE / 2 + 1); j++)
            printf("%i %i %f %fn", i, j, hostOutputData[i*(DATASIZE / 2 + 1) + j].x, hostOutputData[i*(DATASIZE / 2 + 1) + j].y);

    cufftDestroy(handle);
    gpuErrchk(cudaFree(deviceOutputData));
    gpuErrchk(cudaFree(deviceInputData));

}

N.B. Add the cuFFT error checking according to http://stackoverflow.com/questions/16267149/cufft-error-handling to this code.

Leave a Reply

Your email address will not be published. Required fields are marked *