CUDA Timing for Multi-GPU Applications

Let us consider the example in the last post, in which we underlined how asynchronous copies enable true multi-GPU concurrency. In particular, let us consider Test case #8 of that post.

The full code of Test case #8 is available on our GitHub website, while the profiler timeline is reported here for the sake of clarity:

0

Profiler timeline

The full code for the timing example here reported is available on our GitHub website.

Timing the asynchronous copies – concurrency is destroyed

Now, let us begin by timing the asynchronous copies. A possible way to do so is to use the following snippet:

float time[numGPUs];
cudaEvent_t start[numGPUs], stop[numGPUs];

// --- "Breadth-first" approach - async
// NOTE: cudaEventSynchronize() inside the per-GPU loop blocks the host until
// device k has finished its copy before the copy for device k+1 is even
// issued — this host-side serialization is what destroys concurrency here.
for (int k = 0; k < numGPUs; k++) {

   gpuErrchk(cudaSetDevice(k));
   gpuErrchk(cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync));
   gpuErrchk(cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync));
   gpuErrchk(cudaEventRecord(start[k], 0));
   // --- No stream argument: the copy is issued on device k's default stream.
   gpuErrchk(cudaMemcpyAsync(plan[k].d_data, plan[k].h_data, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
   gpuErrchk(cudaEventRecord(stop[k], 0));
   gpuErrchk(cudaEventSynchronize(stop[k]));
   gpuErrchk(cudaEventElapsedTime(&time[k], start[k], stop[k]));
   // --- Release the event handles once the elapsed time has been read
   //     (the original leaked one start/stop pair per GPU).
   gpuErrchk(cudaEventDestroy(start[k]));
   gpuErrchk(cudaEventDestroy(stop[k]));
}

for (int k = 0; k < numGPUs; k++) printf("Elapsed time:  %3.1f ms \n", time[k]);

Unfortunately, this way of timing destroys concurrency, as can be seen from the profiler timeline below:

1

Profiler timeline

Timing the asynchronous copies – concurrency is preserved

To avoid this problem, one possibility is to launch each GPU task from its own OpenMP thread, as follows:

int maxNumProcessors = omp_get_max_threads();
std::cout << "Maximum number of CPU threads = " << maxNumProcessors << std::endl;

// --- "Breadth-first" approach - async
// One OpenMP thread per GPU: each thread blocks only on its OWN stop event,
// so the host-side waits no longer serialize the devices and the copies
// can overlap across GPUs.
omp_set_num_threads(numGPUs);
#pragma omp parallel
{
   // omp_get_thread_num() returns int; also matches the "%i" printf below.
   const int k = omp_get_thread_num();
   gpuErrchk(cudaSetDevice(k));
   gpuErrchk(cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync));
   gpuErrchk(cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync));
   gpuErrchk(cudaEventRecord(start[k], 0));

   // --- No stream argument: the copy is issued on device k's default stream.
   gpuErrchk(cudaMemcpyAsync(plan[k].d_data, plan[k].h_data, NperGPU * sizeof(double), cudaMemcpyHostToDevice));

   gpuErrchk(cudaEventRecord(stop[k], 0));
   gpuErrchk(cudaEventSynchronize(stop[k]));
   gpuErrchk(cudaEventElapsedTime(&time[k], start[k], stop[k]));

   printf("Thread nr. %i; Elapsed time:  %3.1f ms \n", k, time[k]);

   // --- Release the event handles once the timing has been read.
   gpuErrchk(cudaEventDestroy(start[k]));
   gpuErrchk(cudaEventDestroy(stop[k]));
}

As can be seen from the profiler timeline, concurrency is preserved.

2

Profiler timeline

Timing the kernel launches – concurrency is destroyed

The same happens when timing the kernel launches: with the following snippet, concurrency is destroyed.

// NOTE: cudaEventSynchronize() inside the per-GPU loop blocks the host until
// device k's kernel has finished before device k+1's kernel is launched —
// this host-side serialization is what destroys concurrency here.
for (int k = 0; k < numGPUs; k++) {
   gpuErrchk(cudaSetDevice(k));
   gpuErrchk(cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync));
   gpuErrchk(cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync));
   gpuErrchk(cudaEventRecord(start[k], 0));
   kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
   // --- Kernel launches return no status directly; catch config errors here.
   gpuErrchk(cudaPeekAtLastError());
   gpuErrchk(cudaEventRecord(stop[k], 0));
   gpuErrchk(cudaEventSynchronize(stop[k]));
   gpuErrchk(cudaEventElapsedTime(&time[k], start[k], stop[k]));
   // --- Release the event handles once the elapsed time has been read.
   gpuErrchk(cudaEventDestroy(start[k]));
   gpuErrchk(cudaEventDestroy(stop[k]));
}

for (int k = 0; k < numGPUs; k++) printf("Elapsed time:  %3.1f ms \n", time[k]);
3

Profiler timeline

Timing the kernel launches – concurrency is preserved

In contrast to the above, concurrency is preserved when using OpenMP.

int maxNumProcessors = omp_get_max_threads();
std::cout << "Maximum number of CPU threads = " << maxNumProcessors << std::endl;

// One OpenMP thread per GPU: each thread blocks only on its OWN stop event,
// so the kernels launched on the different devices can run concurrently.
omp_set_num_threads(numGPUs);
#pragma omp parallel
{
   // omp_get_thread_num() returns int; also matches the "%i" printf below.
   const int k = omp_get_thread_num();
   gpuErrchk(cudaSetDevice(k));
   gpuErrchk(cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync));
   gpuErrchk(cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync));
   gpuErrchk(cudaEventRecord(start[k], 0));
   kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
   // --- Kernel launches return no status directly; catch config errors here.
   gpuErrchk(cudaPeekAtLastError());
   gpuErrchk(cudaEventRecord(stop[k], 0));
   gpuErrchk(cudaEventSynchronize(stop[k]));
   gpuErrchk(cudaEventElapsedTime(&time[k], start[k], stop[k]));

   printf("Thread nr. %i; Elapsed time:  %3.1f ms \n", k, time[k]);

   // --- Release the event handles once the timing has been read.
   gpuErrchk(cudaEventDestroy(start[k]));
   gpuErrchk(cudaEventDestroy(stop[k]));
}
4

Profiler timeline

Leave a Reply

Your email address will not be published. Required fields are marked *