CUDA – 1D Linear Interpolation with Global Memory, Texture Memory or Texture Filtering

M is the length of input sampling points assumed to be taken with unit sampling step between -M/2 and M/2 and N is the length of the output sampling points, non-uniformly spaced again in the interval -M/2 … M/2.

Version using Global Memory
In a “Kernels_Interpolation.cu” file

extern "C" void linear_interpolation_function_GPU(float2* result_d, float2* data_d, float* x_in_d, float* x_out_d, int M, int N){
   float* result_d_temp = (float*)result_d;
   float* data_d_temp = (float*)data_d;
   dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid((2*N)/BLOCK_SIZE + ((2*N)%BLOCK_SIZE == 0 ? 0:1),1);
   linear_interpolation_kernel_function_GPU<<<dimGrid,dimBlock>>>(result_d_temp, data_d_temp, x_out_d, M, 2*N);
}

In a “Kernels_Interpolation.cuh” file

__global__ void linear_interpolation_kernel_function_GPU(float* __restrict__ result_d, const float* __restrict__ data_d, const float* __restrict__ x_out_d, const int M, const int N)
{
   int j = threadIdx.x + blockDim.x * blockIdx.x;
   if(j<N)
   {
      float reg_x_out = x_out_d[j/2]+M/2;
      float t = truncf(reg_x_out);
      int k = (int)t;
      float a = reg_x_out - t;
      float dk = data_d[2*k+(j&1)];
      float dkp1 = data_d[2*k+2+(j&1)];
      result_d[j] = a * dkp1 + (-dk * a + dk);
    }
}

Version using Texture Memory
In a “Kernels_Interpolation.cu” file

extern "C" void linear_interpolation_function_GPU_texture(float2* result_d, float2* data_d, float* x_in_d, float* x_out_d, int M, int N){
   float* result_d_temp = (float*)result_d;
   float* data_d_temp = (float*)data_d;
   cudaBindTexture(NULL, data_d_GPU_texture, data_d_temp, 2*M*sizeof(float));
   dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid((2*N)/BLOCK_SIZE + ((2*N)%BLOCK_SIZE == 0 ? 0:1),1);
   linear_interpolation_kernel_function_GPU_texture<<<dimGrid,dimBlock>>>(result_d_temp, x_out_d, 2*M, 2*N);
}

In a “Kernels_Interpolation.cuh” file

texture<float,1> data_d_GPU_texture;

__global__ void linear_interpolation_kernel_function_GPU_texture(float* __restrict__ result_d, const float* __restrict__ x_out_d, const int M, const int N)
{
   int j = threadIdx.x + blockDim.x * blockIdx.x;
   if(j<N)
   {
      float reg_x_out = x_out_d[j/2]+M/4;
      float t = truncf(reg_x_out);
      int k = (int)t;
      float a = reg_x_out - t;
      float dk = tex1Dfetch(data_d_GPU_texture,2*k+(j&1));
      float dkp1 = tex1Dfetch(data_d_GPU_texture,2*k+2+(j&1));
      result_d[j] = a * dkp1 + (-dk * a + dk);
   }
}

Version using Texture Filtering
In a “Kernels_Interpolation.cu” file

extern "C" void linear_interpolation_function_GPU_texture_filtering(float2* result_d, float2* data, float* x_in_d, float* x_out_d, int M, int N){
   cudaArray* data_d = NULL; cudaMallocArray (&data_d, &data_d_texture.channelDesc, M, 1);
   cudaMemcpyToArray(data_d, 0, 0, data, sizeof(float2)*M, cudaMemcpyHostToDevice);
   cudaBindTextureToArray(data_d_texture, data_d);
   data_d_texture.normalized = false;
   data_d_texture.filterMode = cudaFilterModeLinear;
   dim3 dimBlock(BLOCK_SIZE,1); dim3 dimGrid(N/BLOCK_SIZE + (N%BLOCK_SIZE == 0 ? 0:1),1);
   linear_interpolation_kernel_function_GPU_texture_filtering<<<dimGrid,dimBlock>>>(result_d, x_out_d, M, N);
}

In a “Kernels_Interpolation.cuh” file

texture<float2, 1, cudaReadModeElementType> data_d_texture;

__global__ void linear_interpolation_kernel_function_GPU_texture_filtering(float2* __restrict__ result_d, const float* __restrict__ x_out_d, const int M, const int N)
{
   int j = threadIdx.x + blockDim.x * blockIdx.x;
   if(j<N) result_d[j] = tex1D(data_d_texture,float(x_out_d[j]+M/2+0.5));
}

For comments regarding the relative speed and accuracy of the three solutions, please see Nvidia Forum.

 

 

Leave a Reply

Your email address will not be published. Required fields are marked *