Questions
I am having performance trouble with a CUDA matrix multiply implemented with shared memory: the running duration is worse than I expected.
# Naive matrix multiply
---------- Timing ----------
cudaMalloc: 199319 us
cudaMemcpy H2D: 125567 us
kernel: 295 us
cudaMemcpy D2H: 801516 us
Total: 1126699 us
----------------------------
# Matrix multiply with shared memory
---------- Timing ----------
cudaMalloc: 194859 us
cudaMemcpy H2D: 127967 us
kernel: 49970 us
cudaMemcpy D2H: 648310 us
Total: 1021107 us
----------------------------
As you can see, the kernel time of the naive multiply is better than the version that uses shared memory (the official solution is similar to mine).
My question:
- Does the shared memory really help here? It looks like the data in shared memory is overwritten on the next loop iteration, so no data is re-used.
My Code
To show my timing strategy, I am pasting the full code here.
Definition
#include <algorithm>
#include <chrono>
#include <cstdio>
#define cudaErrorCheck(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal Error: %s (%s at %s:%d)\n", msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
            fprintf(stderr, "***** FAILED ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
// Size per dimension
constexpr int DATA_SIZE = 8192;
constexpr int BLOCK_SIZE = 32;
constexpr int TILE_SIZE = 32;
constexpr float INIT_VAL_A = 3.0f;
constexpr float INIT_VAL_B = 2.0f;
/**
 * The number of threads equals DATA_SIZE * DATA_SIZE;
 * each thread calculates one element of matrix C.
 */
__global__ void mm_naive(const float *A, const float *B, float *C, const int size) {
    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x < size && y < size) {
        float accum = 0.0f;
        for (int i = 0; i < size; i += 1) {
            accum += A[i + y * size] * B[x + i * size];
        }
        C[x + y * size] = accum;
    }
}
/**
* Matrix multiply with shared mem
*/
__global__ void mm_shared(const float *A, const float *B, float *C, const int size) {
    __shared__ float As[TILE_SIZE][TILE_SIZE];
    __shared__ float Bs[TILE_SIZE][TILE_SIZE];
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < size && y < size) {
        float accum = 0.0f;
        for (int i = 0; i < (size + TILE_SIZE - 1) / TILE_SIZE; i += 1) {
            // Load one tile of A and one tile of B into shared memory
            As[threadIdx.y][threadIdx.x] = A[y * size + (i * TILE_SIZE + threadIdx.x)];
            Bs[threadIdx.y][threadIdx.x] = B[(i * TILE_SIZE + threadIdx.y) * size + x];
            __syncthreads();
            // Accumulate the partial product for this tile
            for (int j = 0; j < TILE_SIZE; j += 1) {
                accum += As[threadIdx.y][j] * Bs[j][threadIdx.x];
            }
            __syncthreads();
        }
        C[y * size + x] = accum;
    }
}
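// Note: mm_shared assumes blockDim.x == blockDim.y == TILE_SIZE, which holds in
// main() below since BLOCK_SIZE == TILE_SIZE == 32; the tile indexing would be
// wrong otherwise.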
int main() {
    float *A_h = new float[DATA_SIZE * DATA_SIZE];
    float *B_h = new float[DATA_SIZE * DATA_SIZE];
    float *C_h = new float[DATA_SIZE * DATA_SIZE];
    std::fill_n(A_h, DATA_SIZE * DATA_SIZE, INIT_VAL_A);
    std::fill_n(B_h, DATA_SIZE * DATA_SIZE, INIT_VAL_B);
    auto t0 = std::chrono::system_clock::now();
    float *A_d = nullptr;
    float *B_d = nullptr;
    float *C_d = nullptr;
    cudaMalloc(&A_d, DATA_SIZE * DATA_SIZE * sizeof(float));
    cudaMalloc(&B_d, DATA_SIZE * DATA_SIZE * sizeof(float));
    cudaMalloc(&C_d, DATA_SIZE * DATA_SIZE * sizeof(float));
    auto t1 = std::chrono::system_clock::now();
    cudaMemcpy(A_d, A_h, DATA_SIZE * DATA_SIZE * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(B_d, B_h, DATA_SIZE * DATA_SIZE * sizeof(float), cudaMemcpyHostToDevice);
    auto t2 = std::chrono::system_clock::now();
    // Launch the kernel (mm_naive or mm_shared)
    dim3 block_dim(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid_dim((DATA_SIZE + block_dim.x - 1) / block_dim.x, (DATA_SIZE + block_dim.y - 1) / block_dim.y);
    mm_shared<<<grid_dim, block_dim>>>(A_d, B_d, C_d, DATA_SIZE);
    auto t3 = std::chrono::system_clock::now();
    cudaMemcpy(C_h, C_d, DATA_SIZE * DATA_SIZE * sizeof(float), cudaMemcpyDeviceToHost);
    auto t4 = std::chrono::system_clock::now();
    printf("---------- Timing ----------\n");
    printf("cudaMalloc: %ld us\n", std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count());
    printf("cudaMemcpy H2D: %ld us\n", std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count());
    printf("kernel: %ld us\n", std::chrono::duration_cast<std::chrono::microseconds>(t3 - t2).count());
    printf("cudaMemcpy D2H: %ld us\n", std::chrono::duration_cast<std::chrono::microseconds>(t4 - t3).count());
    printf("Total: %ld us\n", std::chrono::duration_cast<std::chrono::microseconds>(t4 - t0).count());
    printf("----------------------------\n");
    return 0;
}
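For reference, here is a minimal variant of my kernel timing with explicit synchronization (just a sketch of how the launch lines above could be measured; it is not what produced the numbers shown), in case the asynchronous launch shifts time between the "kernel" and "D2H" entries:

// Sketch: time the kernel with explicit synchronization points.
cudaDeviceSynchronize();  // finish any pending work before starting the clock
auto k0 = std::chrono::system_clock::now();
mm_shared<<<grid_dim, block_dim>>>(A_d, B_d, C_d, DATA_SIZE);
cudaDeviceSynchronize();  // wait until the kernel has actually finished
auto k1 = std::chrono::system_clock::now();
printf("kernel (synchronized): %ld us\n",
       std::chrono::duration_cast<std::chrono::microseconds>(k1 - k0).count());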
My platform is Ubuntu 20.04 with an RTX TITAN and CUDA 12.2.
I have copied the official code and get similar performance, so maybe the shared memory there is not helping either.
Thanks for reminding me to add my compile command, which is very simple:
nvcc matrix_mul.cu -o matrix_mul
./matrix_mul
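In case compiler flags matter, a variant with host optimization and an explicit GPU architecture (assuming the RTX TITAN is Turing, i.e. sm_75) would be:
nvcc -O3 -arch=sm_75 matrix_mul.cu -o matrix_mul
./matrix_mul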