The CUDA SDK offers a tiled matrix-matrix multiplication example using shared memory.

However, it is limited to the case when the matrix dimensions are multiples of the tile dimension.

When, on the contrary, the matrix dimensions are not multiples of the tile dimensions, some tiles will only partially overlap the matrices.

The elements of such partially overlapping tiles that fall outside the matrices must be set to zero, so that they do not contribute to the accumulated products.

Below is a kernel function for the general case of tiled matrix-matrix multiplication between arbitrarily sized matrices using shared memory.

/*
 * Tiled matrix-matrix multiplication C = A * B using shared memory, for
 * matrices whose dimensions need not be multiples of TILE_DIM.
 *
 * All matrices are addressed as dense row-major arrays:
 *   A is ARows x ACols, B is BRows x BCols, C is CRows x CCols.
 *
 * Launch requirements:
 *   - TILE_DIM must be a compile-time constant visible before this kernel
 *     (it sizes the static shared-memory tiles).
 *   - blockDim must be (TILE_DIM, TILE_DIM): every thread loads exactly one
 *     element of each tile, and the inner product reads all TILE_DIM entries.
 *   - The grid must span at least ceil(CCols / TILE_DIM) blocks in x and
 *     ceil(CRows / TILE_DIM) blocks in y for every element of C to be written.
 *   - Static shared memory used per block: 2 * TILE_DIM * TILE_DIM floats.
 *
 * Tile elements that fall outside A or B are loaded as zero, so they add
 * nothing to the accumulated dot product.
 */
__global__ void MatMul(float* A, float* B, float* C, int ARows, int ACols, int BRows, int BCols, int CRows, int CCols)
{
    float CValue = 0.0f;

    // Global row/column of the C element owned by this thread.
    int Row = blockIdx.y * TILE_DIM + threadIdx.y;
    int Col = blockIdx.x * TILE_DIM + threadIdx.x;

    __shared__ float As[TILE_DIM][TILE_DIM];
    __shared__ float Bs[TILE_DIM][TILE_DIM];

    // Number of tiles needed to cover the shared dimension (ceil-div).
    const int numTiles = (TILE_DIM + ACols - 1) / TILE_DIM;

    for (int k = 0; k < numTiles; k++) {
        // Column of A / row of B this thread fetches for tile k.
        const int aCol = k * TILE_DIM + threadIdx.x;
        const int bRow = k * TILE_DIM + threadIdx.y;

        // Index arithmetic is widened to size_t so Row * ACols cannot
        // overflow a 32-bit int on very large matrices.
        if (aCol < ACols && Row < ARows)
            As[threadIdx.y][threadIdx.x] = A[static_cast<size_t>(Row) * ACols + aCol];
        else
            As[threadIdx.y][threadIdx.x] = 0.0f;

        if (bRow < BRows && Col < BCols)
            Bs[threadIdx.y][threadIdx.x] = B[static_cast<size_t>(bRow) * BCols + Col];
        else
            Bs[threadIdx.y][threadIdx.x] = 0.0f;

        // Both tiles must be completely written before any thread reads them.
        __syncthreads();

        for (int n = 0; n < TILE_DIM; ++n)
            CValue += As[threadIdx.y][n] * Bs[n][threadIdx.x];

        // Nobody may overwrite the tiles for the next k while others still read.
        __syncthreads();
    }

    // Store through the same Row/Col that the bounds check uses, so the guard
    // and the written address can never disagree.
    if (Row < CRows && Col < CCols)
        C[static_cast<size_t>(Row) * CCols + Col] = CValue;
}

The following version of the kernel is “equivalent” to that above, but does not use shared memory.

/*
 * Matrix-matrix multiplication C = A * B reading operands directly from the
 * pointers passed in, without shared-memory tiles.
 *
 * All matrices are addressed as dense row-major arrays:
 *   A is ARows x ACols, B is BRows x BCols, C is CRows x CCols.
 *
 * Launch requirements:
 *   - TILE_DIM must be defined before this kernel; it is only used to map
 *     (blockIdx, threadIdx) to a row/column of C.
 *   - blockDim is expected to be (TILE_DIM, TILE_DIM) so that each element of
 *     C is owned by exactly one thread.
 *   - The grid must span at least ceil(CCols / TILE_DIM) blocks in x and
 *     ceil(CRows / TILE_DIM) blocks in y for every element of C to be written.
 *
 * Each thread accumulates products over the indices valid for both operands,
 * i.e. [0, min(ACols, BRows)), in ascending order.
 */
__global__ void MatMulNoShared(float* A, float* B, float* C, int ARows, int ACols, int BRows, int BCols, int CRows, int CCols)
{
    float CValue = 0.0f;

    // Global row/column of the C element owned by this thread.
    int Row = blockIdx.y * TILE_DIM + threadIdx.y;
    int Col = blockIdx.x * TILE_DIM + threadIdx.x;

    // The row/column checks do not depend on the summation index, so they are
    // evaluated once instead of on every iteration.
    if (Row < ARows && Col < BCols) {
        // Only indices valid in both A's columns and B's rows contribute.
        const int inner = (ACols < BRows) ? ACols : BRows;

        // Index arithmetic is widened to size_t so Row * ACols cannot
        // overflow a 32-bit int on very large matrices.
        const size_t aRowBase = static_cast<size_t>(Row) * ACols;

        for (int idx = 0; idx < inner; ++idx)
            CValue += A[aRowBase + idx] * B[static_cast<size_t>(idx) * BCols + Col];
    }

    // Store through the same Row/Col that the bounds check uses, so the guard
    // and the written address can never disagree.
    if (Row < CRows && Col < CCols)
        C[static_cast<size_t>(Row) * CCols + Col] = CValue;
}

The full Visual Studio 2010 project is downloadable here.