# 1D FFTs of columns and rows of a 3D matrix in CUDA

In this post we explore the possibility of using a single cufftPlanMany to perform 1D FFTs of the columns of a 3D matrix.
Transformations performed according to cufftPlanMany, that is called like

`cufftPlanMany(&handle, rank, n, inembed, istride, idist, onembed, ostride, odist, CUFFT_C2C, batch);`

must obey the Advanced Data Layout . In particular, 1D FFTs are worked out according to the following layout

`input[b * idist + x * istride]`

where b addresses the b-th signal and istride is the distance between two consecutive items in the same signal.

If the 3D matrix has dimensions M * N * Q and if you want to perform 1D transforms along the columns, then the distance between two consecutive elements will be M, while the distance between two consecutive signals will be 1.
Furthermore, the number of batched executions must be set equal to M.
With those parameters, one is able to cover only one slice of the 3D matrix.

Indeed, if one tries increasing M, then the cuFFT will start trying to compute new column-wise FFTs starting from the second row. The only solution to this problem is an iterative call to cufftExecC2C to cover all the Q slices.

For the record, the following code provides a fully worked example on how performing 1D FFTs of the columns of a 3D matrix.

```#include <thrust/device_vector.h>
#include <cufft.h>

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %dn", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}

int main() {

const int M = 3;
const int N = 4;
const int Q = 2;

thrust::host_vector<float2> h_matrix(M * N * Q);

for (int k=0; k<Q; k++)
for (int j=0; j<N; j++)
for (int i=0; i<M; i++) {
float2 temp;
temp.x = (float)(j + k * M);
//temp.x = 1.f;
temp.y = 0.f;
h_matrix[k*M*N+j*M+i] = temp;
printf("%i %i %i %f %fn", i, j, k, temp.x, temp.y);
}
printf("n");

thrust::device_vector<float2> d_matrix(h_matrix);

thrust::device_vector<float2> d_matrix_out(M * N * Q);

//     input[b * idist + x * istride]
//     output[b * odist + x * ostride]
//     b = signal number
//     x = element of the b-th signal

cufftHandle handle;
int rank = 1;                           // --- 1D FFTs
int n[] = { N };                        // --- Size of the Fourier transform
int istride = M, ostride = M;           // --- Distance between two successive input/output elements
int idist = 1, odist = 1;               // --- Distance between batches
int inembed[] = { 0 };                  // --- Input size with pitch (ignored for 1D transforms)
int onembed[] = { 0 };                  // --- Output size with pitch (ignored for 1D transforms)
int batch = M;                          // --- Number of batched executions
cufftPlanMany(&handle, rank, n,
inembed, istride, idist,
onembed, ostride, odist, CUFFT_C2C, batch);

for (int k=0; k<Q; k++)
cufftExecC2C(handle, (cufftComplex*)(thrust::raw_pointer_cast(d_matrix.data()) + k * M * N), (cufftComplex*)(thrust::raw_pointer_cast(d_matrix_out.data()) + k * M * N), CUFFT_FORWARD);
cufftDestroy(handle);

for (int k=0; k<Q; k++)
for (int j=0; j<N; j++)
for (int i=0; i<M; i++) {
float2 temp = d_matrix_out[k*M*N+j*M+i];
printf("%i %i %i %f %fn", i, j, k, temp.x, temp.y);
}

}```

The situation is different for the case when one wants to perform 1D transforms of the rows.
In that case, the distance between two consecutive elements is 1, while the distance between two consecutive signals is M.
This allows you to set a number of N * Q transformations and then invoking cufftExecC2C only one time. For the record, the code below provides a full example of 1D transformations of the rows of a 3D matrix.

```#include <thrust/device_vector.h>
#include <cufft.h>

/********************/
/* CUDA ERROR CHECK */
/********************/

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %dn", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}

int main() {

const int M = 3;
const int N = 4;
const int Q = 2;

thrust::host_vector<float2> h_matrix(M * N * Q);

for (int k=0; k<Q; k++)
for (int j=0; j<N; j++)
for (int i=0; i<M; i++) {
float2 temp;
temp.x = (float)(j + k * M);
//temp.x = 1.f;
temp.y = 0.f;
h_matrix[k*M*N+j*M+i] = temp;
printf("%i %i %i %f %fn", i, j, k, temp.x, temp.y);
}
printf("n");

thrust::device_vector<float2> d_matrix(h_matrix);

thrust::device_vector<float2> d_matrix_out(M * N * Q);

//     input[b * idist + x * istride]
//     output[b * odist + x * ostride]
//     b = signal number
//     x = element of the b-th signal

cufftHandle handle;
int rank = 1;                           // --- 1D FFTs
int n[] = { M };                        // --- Size of the Fourier transform
int istride = 1, ostride = 1;           // --- Distance between two successive input/output elements
int idist = M, odist = M;               // --- Distance between batches
int inembed[] = { 0 };                  // --- Input size with pitch (ignored for 1D transforms)
int onembed[] = { 0 };                  // --- Output size with pitch (ignored for 1D transforms)
int batch = N * Q;                      // --- Number of batched executions
cufftPlanMany(&handle, rank, n,
inembed, istride, idist,
onembed, ostride, odist, CUFFT_C2C, batch);

cufftExecC2C(handle, (cufftComplex*)(thrust::raw_pointer_cast(d_matrix.data())), (cufftComplex*)(thrust::raw_pointer_cast(d_matrix_out.data())), CUFFT_FORWARD);
cufftDestroy(handle);

for (int k=0; k<Q; k++)
for (int j=0; j<N; j++)
for (int i=0; i<M; i++) {
float2 temp = d_matrix_out[k*M*N+j*M+i];
printf("%i %i %i %f %fn", i, j, k, temp.x, temp.y);
}

}```

## One thought on “1D FFTs of columns and rows of a 3D matrix in CUDA”

1. Z says:

Is it possible to do this without the loop for the column 1D ffts?