1D Finite Difference Time Domain (FDTD) in CUDA for Maxwell's equations

A question that often comes up is whether the 1D Finite Difference Time Domain (FDTD) method can be faster when implemented in C/C++ and run on a sequential CPU than when implemented in CUDA and run on a parallel GPU.

We try to answer this question with the code below, which contains implementations of the 1D FDTD method for an electromagnetic application in both C/C++ and CUDA.
The theory and the C/C++ implementation are taken from John B. Schneider's Understanding the Finite-Difference Time-Domain Method (see Program 3.1).
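
For reference, these are the update equations the code implements, in Schneider's normalized formulation (Courant number equal to one, with the free-space impedance \eta_0 = 377 \,\Omega appearing as imp0), written in LaTeX notation:

H_y^{q+1/2}[m] \;=\; H_y^{q-1/2}[m] \;+\; \frac{E_z^{q}[m+1] - E_z^{q}[m]}{\eta_0}

E_z^{q+1}[m] \;=\; E_z^{q}[m] \;+\; \eta_0 \,\bigl( H_y^{q+1/2}[m] - H_y^{q+1/2}[m-1] \bigr)

The additive Gaussian source E_z^{q+1}[m_{source}] \mathrel{+}= \exp\bigl(-(q - 30)^2 / 100\bigr) is applied at the center node.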

The CUDA version contains two approaches: one using global memory only and one using shared memory.
In the latter case, we enforce synchronization between the magnetic field and electric field updates by launching two different kernels, since __syncthreads() only synchronizes the threads of a single block while the update scheme needs a grid-wide barrier between the two half-steps, as sketched below.
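
As a minimal sketch (using the kernel names from the code below), the host-side time stepping for the shared-memory version looks as follows; kernels launched into the same stream execute in order, so the second kernel does not start until the first has completed across the entire grid, which provides the grid-wide barrier that __syncthreads() cannot:

    for (int qTime = 0; qTime < maxTime; qTime++) {
        // --- The boundary between the two launches acts as the grid-wide barrier
        updateDevice_hy<<<iDivUp(SIZE, BLOCKSIZE), BLOCKSIZE>>>(d_ez, d_hy, imp0, qTime, source, SIZE);
        updateDevice_ez<<<iDivUp(SIZE, BLOCKSIZE), BLOCKSIZE>>>(d_ez, d_hy, imp0, qTime, source, SIZE);
    }
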
For a sufficiently large problem (SIZE = 10000000), the GPU version is indeed faster than the CPU one. We have tested the code on a Kepler K20c card, obtaining the following results:

Shared Memory version
CPU elapsed time = 3980.763 ms
GPU elapsed time = 356.828 ms

Global Memory version
GPU elapsed time = 359.768 ms

The version using shared memory does not improve performance. This is unsurprising: the kernels are memory-bandwidth bound, and each field value is reused by at most two threads, so there is too little data reuse for shared memory staging to pay off.
Here is the code:

kernel.cu

/* 1D FDTD simulation with an additive source. */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#include "TimingCPU.h"
#include "TimingGPU.cuh"

#define BLOCKSIZE   512
//#define DEBUG

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

/***********************************/
/* HOST-SIDE FIELD UPDATE FUNCTION */
/***********************************/
void updateHost(double *h_ez, double* h_hy, double imp0, double qTime, const int source, const int N) {

    /* update magnetic field */
    for (int mm = 0; mm < N - 1; mm++)
        h_hy[mm] = h_hy[mm] + (h_ez[mm + 1] - h_ez[mm]) / imp0;

    /* update electric field */
    for (int mm = 1; mm < N; mm++)
        h_ez[mm] = h_ez[mm] + (h_hy[mm] - h_hy[mm - 1]) * imp0;

    /* use additive source at the source node */
    h_ez[source] += exp(-(qTime - 30.) * (qTime - 30.) / 100.);

}

/********************************************************/
/* DEVICE-SIDE FIELD UPDATE FUNCTION - NO SHARED MEMORY */
/********************************************************/
__global__ void updateDevice_v0(double *d_ez, double* d_hy, double imp0, double qTime, const int source, const int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    /* update magnetic field */
    if (tid < N-1) d_hy[tid] = d_hy[tid] + (d_ez[tid + 1] - d_ez[tid]) / imp0;

    // --- Note: __threadfence() only orders this thread's memory writes; it is
    //     not a grid-wide barrier, so the electric field update below may still
    //     read stale hy values across block and warp boundaries
    __threadfence();

    /* update electric field */
    if ((tid < N)&&(tid > 0)) d_ez[tid] = d_ez[tid] + (d_hy[tid] - d_hy[tid - 1]) * imp0;

    /* use additive source at the source node */
    if (tid == source) d_ez[tid] += exp(-(qTime - 30.) * (qTime - 30.) / 100.);

}

/**************************************************************/
/* DEVICE-SIDE MAGNETIC FIELD UPDATE FUNCTION - SHARED MEMORY */
/**************************************************************/
__global__ void updateDevice_hy(double *d_ez, double* d_hy, double imp0, double qTime, const int source, const int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    __shared__ double hy_temp[BLOCKSIZE+1], ez_temp[BLOCKSIZE+1];

    if (tid < N) {
        hy_temp[threadIdx.x] = d_hy[tid];
        ez_temp[threadIdx.x] = d_ez[tid];
    }

    // --- Right halo element; only the ez halo is read by the hy update below
    if ((threadIdx.x == 0) && ((tid + BLOCKSIZE) < N))
        ez_temp[BLOCKSIZE] = d_ez[tid + BLOCKSIZE];

    __syncthreads();

    /* update magnetic field */
    if (tid < N-1) d_hy[tid] = hy_temp[threadIdx.x] + (ez_temp[threadIdx.x + 1] - ez_temp[threadIdx.x]) / imp0;

}

/**************************************************************/
/* DEVICE-SIDE ELECTRIC FIELD UPDATE FUNCTION - SHARED MEMORY */
/**************************************************************/
__global__ void updateDevice_ez(double *d_ez, double* d_hy, double imp0, double qTime, const int source, const int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    __shared__ double hy_temp[BLOCKSIZE+1], ez_temp[BLOCKSIZE+1];

    if (tid < N) {
        hy_temp[threadIdx.x + 1] = d_hy[tid];
        ez_temp[threadIdx.x + 1] = d_ez[tid];
    }

    // --- Left halo element; only the hy halo is read by the ez update below
    if ((threadIdx.x == 0) && (tid >= 1))
        hy_temp[0] = d_hy[tid - 1];

    __syncthreads();

    /* update electric field; the result is kept in a register to avoid a
       shared memory race (thread t reads ez_temp[t + 1] while thread t + 1
       would be overwriting it) */
    double ez_new = ez_temp[threadIdx.x + 1] + (hy_temp[threadIdx.x + 1] - hy_temp[threadIdx.x]) * imp0;

    /* use additive source at the source node */
    if (tid == source) ez_new += exp(-(qTime - 30.) * (qTime - 30.) / 100.);

    if ((tid < N) && (tid > 0)) d_ez[tid] = ez_new;

}

/********/
/* MAIN */
/********/
int main() {

    // --- Problem size
    const int SIZE = 10000000;

    // --- Free-space wave impedance
    double imp0 = 377.0;

    // --- Maximum number of time steps (small enough that the pulse launched
    //     at the center does not reach the domain boundaries)
    int maxTime = 100;

    // --- Source location
    int source = SIZE / 2;

    // --- Host side memory allocations and initializations
    double *h_ez = (double*)calloc(SIZE, sizeof(double));
    double *h_hy = (double*)calloc(SIZE, sizeof(double));

    // --- Device side memory allocations and initializations
    double *d_ez; gpuErrchk(cudaMalloc((void**)&d_ez, SIZE * sizeof(double)));
    double *d_hy; gpuErrchk(cudaMalloc((void**)&d_hy, SIZE * sizeof(double)));
    gpuErrchk(cudaMemset(d_ez, 0, SIZE * sizeof(double)));
    gpuErrchk(cudaMemset(d_hy, 0, SIZE * sizeof(double)));

    // --- Host side memory allocations for debugging purposes
#ifdef DEBUG
    double *h_ez_temp = (double*)calloc(SIZE, sizeof(double));
    double *h_hy_temp = (double*)calloc(SIZE, sizeof(double));
#endif

    // --- Host-side time-steppings
#ifndef DEBUG
    TimingCPU timerCPU;
    timerCPU.StartCounter();
    for (int qTime = 0; qTime < maxTime; qTime++) {
        updateHost(h_ez, h_hy, imp0, qTime, source, SIZE);
    }
    printf("CPU elapsed time = %3.3f msn", timerCPU.GetCounter());
#endif

    TimingGPU timerGPU;
    timerGPU.StartCounter();
    // --- Device-side time-steppings
    for (int qTime = 0; qTime < maxTime; qTime++) {

        // --- Global memory version; to run the shared memory version instead,
        //     comment out the next line and uncomment the two lines below it
        updateDevice_v0<<<iDivUp(SIZE, BLOCKSIZE), BLOCKSIZE>>>(d_ez, d_hy, imp0, qTime, source, SIZE);
//      updateDevice_hy<<<iDivUp(SIZE, BLOCKSIZE), BLOCKSIZE>>>(d_ez, d_hy, imp0, qTime, source, SIZE);
//      updateDevice_ez<<<iDivUp(SIZE, BLOCKSIZE), BLOCKSIZE>>>(d_ez, d_hy, imp0, qTime, source, SIZE);
#ifdef DEBUG
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        gpuErrchk(cudaMemcpy(h_ez_temp, d_ez, SIZE * sizeof(double), cudaMemcpyDeviceToHost));
        gpuErrchk(cudaMemcpy(h_hy_temp, d_hy, SIZE * sizeof(double), cudaMemcpyDeviceToHost));

        updateHost(h_ez, h_hy, imp0, qTime, source, SIZE);
        for (int i=0; i<SIZE; i++) {
            printf("%f %f %f %fn",h_ez_temp[i], h_ez[i], h_hy_temp[i], h_hy[i]);
        }
        printf("n");
#endif
    }
    printf("GPU elapsed time = %3.3f msn", timerGPU.GetCounter());

    // --- Release host and device resources
    free(h_ez);
    free(h_hy);
    gpuErrchk(cudaFree(d_ez));
    gpuErrchk(cudaFree(d_hy));

    return 0;
}

TimingCPU.h

#ifndef __TIMINGCPU_H__
#define __TIMINGCPU_H__

#ifdef __linux__

    class TimingCPU {

        private:
            long cur_time_;

        public:

            TimingCPU();

            ~TimingCPU();

            void StartCounter();

            double GetCounter();
    };

#elif defined(_WIN32) || defined(_WIN64)

    struct PrivateTimingCPU;

    class TimingCPU
    {
        private:
            PrivateTimingCPU *privateTimingCPU;

        public:

            TimingCPU();

            ~TimingCPU();

            void StartCounter();

            double GetCounter();

    }; // TimingCPU class

#endif

#endif

TimingCPU.cpp

/**************/
/* TIMING CPU */
/**************/

#include "TimingCPU.h"

#ifdef __linux__

    #include <sys/time.h>
    #include <stdio.h>

    TimingCPU::TimingCPU(): cur_time_(0) { StartCounter(); }

    TimingCPU::~TimingCPU() { }

    void TimingCPU::StartCounter()
    {
        struct timeval time;
        if(gettimeofday( &time, 0 )) return;
        cur_time_ = 1000000 * time.tv_sec + time.tv_usec;
    }

    double TimingCPU::GetCounter()
    {
        struct timeval time;
        if(gettimeofday( &time, 0 )) return -1;

        long cur_time = 1000000 * time.tv_sec + time.tv_usec;
        double sec = (cur_time - cur_time_) / 1000000.0;
        if(sec < 0) sec += 86400;
        cur_time_ = cur_time;

        return 1000.*sec;
    }

#elif defined(_WIN32) || defined(_WIN64)
    #include <windows.h>
    #include <iostream>

    struct PrivateTimingCPU {
        double  PCFreq;
        __int64 CounterStart;
    };

    // --- Default constructor
    TimingCPU::TimingCPU() { privateTimingCPU = new PrivateTimingCPU; (*privateTimingCPU).PCFreq = 0.0; (*privateTimingCPU).CounterStart = 0; }

    // --- Default destructor
    TimingCPU::~TimingCPU() { delete privateTimingCPU; }

    // --- Starts the timing
    void TimingCPU::StartCounter()
    {
        LARGE_INTEGER li;
        if(!QueryPerformanceFrequency(&li)) std::cout << "QueryPerformanceFrequency failed!\n";

        (*privateTimingCPU).PCFreq = double(li.QuadPart)/1000.0;

        QueryPerformanceCounter(&li);
        (*privateTimingCPU).CounterStart = li.QuadPart;
    }

    // --- Gets the timing counter in ms
    double TimingCPU::GetCounter()
    {
        LARGE_INTEGER li;
        QueryPerformanceCounter(&li);
        return double(li.QuadPart-(*privateTimingCPU).CounterStart)/(*privateTimingCPU).PCFreq;
    }
#endif

TimingGPU.cuh

#ifndef __TIMING_CUH__
#define __TIMING_CUH__

/**************/
/* TIMING GPU */
/**************/

// Events are part of the CUDA API and provide a system-independent way to measure execution times on CUDA devices with
// approximately 0.5 microsecond precision.

struct PrivateTimingGPU;

class TimingGPU
{
    private:
        PrivateTimingGPU *privateTimingGPU;

    public:

        TimingGPU();

        ~TimingGPU();

        void StartCounter();
        void StartCounterFlags();

        float GetCounter();

}; // TimingGPU class

#endif 

TimingGPU.cu

/**************/
/* TIMING GPU */
/**************/

#include "TimingGPU.cuh"

#include <cuda.h>
#include <cuda_runtime.h>

struct PrivateTimingGPU {
    cudaEvent_t     start;
    cudaEvent_t     stop;
};

// default constructor
TimingGPU::TimingGPU() { privateTimingGPU = new PrivateTimingGPU; }

// default destructor
TimingGPU::~TimingGPU() { delete privateTimingGPU; }

void TimingGPU::StartCounter()
{
    cudaEventCreate(&((*privateTimingGPU).start));
    cudaEventCreate(&((*privateTimingGPU).stop));
    cudaEventRecord((*privateTimingGPU).start,0);
}

void TimingGPU::StartCounterFlags()
{
    int eventflags = cudaEventBlockingSync;

    cudaEventCreateWithFlags(&((*privateTimingGPU).start),eventflags);
    cudaEventCreateWithFlags(&((*privateTimingGPU).stop),eventflags);
    cudaEventRecord((*privateTimingGPU).start,0);
}

// Gets the counter in ms
float TimingGPU::GetCounter()
{
    float   time;
    cudaEventRecord((*privateTimingGPU).stop, 0);
    cudaEventSynchronize((*privateTimingGPU).stop);
    cudaEventElapsedTime(&time,(*privateTimingGPU).start,(*privateTimingGPU).stop);
    return time;
}
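
To compile and run everything, something like the following should work (assuming nvcc is on the path; sm_35 matches the compute capability of the Kepler K20c used for the timings above):

nvcc -O3 -arch=sm_35 kernel.cu TimingCPU.cpp TimingGPU.cu -o fdtd_1d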
