nvcc not recognizing a kernel on Windows 11 with CUDA 12.5 (I can compile other *.cu files fine with PowerShell)
nvcc -arch=sm_89 .\simplest_kernel.cu
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
// Kernel as posted in the question. Each thread splits threadIdx.x into a
// quotient x and remainder y using a fixed divisor of 4, then stores x into A
// and y into B at element index x * row + y.
// NOTE(review): `uint` is not declared by any header included above; the
// compiler output quoted further down reports it as undefined on this line.
__global__ void kernel(uint *A, uint *B, int row) {
auto x = threadIdx.x / 4;  // quotient of the thread index by the literal 4
auto y = threadIdx.x % 4;  // remainder, always in 0..3
A[x * row + y] = x;  // `row` is the multiplier applied to x (main passes 4)
B[x * row + y] = y;
}
// Host driver as posted in the question: allocates two SIZE*SIZE buffers on
// the host and on the device, launches `kernel` once, copies both device
// buffers back and prints them as SIZE rows of "[x|y]" cells.
// NOTE(review): none of the malloc/cudaMalloc/cudaMemcpy results are checked,
// and argc/argv are unused.
int main(int argc, char **argv) {
uint *Xs, *Ys;      // host-side result buffers
uint *Xs_d, *Ys_d;  // device-side buffers written by the kernel
uint SIZE = 4;
Xs = (uint *)malloc(SIZE * SIZE * sizeof(uint));
Ys = (uint *)malloc(SIZE * SIZE * sizeof(uint));
cudaMalloc((void **)&Xs_d, SIZE * SIZE * sizeof(uint));
cudaMalloc((void **)&Ys_d, SIZE * SIZE * sizeof(uint));
// One block of 16 threads; the literal 4s here are not derived from SIZE.
dim3 grid_size(1, 1, 1);
dim3 block_size(4 * 4);
kernel<<<grid_size, block_size>>>(Xs_d, Ys_d, 4);
cudaMemcpy(Xs, Xs_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);
cudaMemcpy(Ys, Ys_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);
// The synchronize call is issued only after both copies have been requested.
cudaDeviceSynchronize();
// `row`/`col` are signed int while SIZE is unsigned, so the comparisons mix
// signedness.
for (int row = 0; row < SIZE; ++row) {
for (int col = 0; col < SIZE; ++col) {
std::cout << "[" << Xs[row * SIZE + col] << "|" << Ys[row * SIZE + col]
<< "] ";
}
std::cout << "\n";
}
cudaFree(Xs_d);
cudaFree(Ys_d);
free(Xs);
free(Ys);
}
results in
PS D:\samples\api\SGEMM_CUDA> nvcc -arch=sm_89 .\simplest_kernel.cu
simplest_kernel.cu
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: attribute "__global__" does not apply here
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: incomplete type "void" is not allowed
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "uint" is undefined
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "A" is undefined
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "B" is undefined
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: type name is not allowed
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: expected a ")"
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: expected a ";"
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
nvcc not recognizing a kernel on Windows 11 with CUDA 12.5 (I can compile other *.cu files fine with PowerShell)
nvcc -arch=sm_89 .\simplest_kernel.cu
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
// Kernel as posted in the question. Each thread splits threadIdx.x into a
// quotient x and remainder y using a fixed divisor of 4, then stores x into A
// and y into B at element index x * row + y.
// NOTE(review): `uint` is not declared by any header included above; the
// compiler output quoted further down reports it as undefined on this line.
__global__ void kernel(uint *A, uint *B, int row) {
auto x = threadIdx.x / 4;  // quotient of the thread index by the literal 4
auto y = threadIdx.x % 4;  // remainder, always in 0..3
A[x * row + y] = x;  // `row` is the multiplier applied to x (main passes 4)
B[x * row + y] = y;
}
// Host driver as posted in the question: allocates two SIZE*SIZE buffers on
// the host and on the device, launches `kernel` once, copies both device
// buffers back and prints them as SIZE rows of "[x|y]" cells.
// NOTE(review): none of the malloc/cudaMalloc/cudaMemcpy results are checked,
// and argc/argv are unused.
int main(int argc, char **argv) {
uint *Xs, *Ys;      // host-side result buffers
uint *Xs_d, *Ys_d;  // device-side buffers written by the kernel
uint SIZE = 4;
Xs = (uint *)malloc(SIZE * SIZE * sizeof(uint));
Ys = (uint *)malloc(SIZE * SIZE * sizeof(uint));
cudaMalloc((void **)&Xs_d, SIZE * SIZE * sizeof(uint));
cudaMalloc((void **)&Ys_d, SIZE * SIZE * sizeof(uint));
// One block of 16 threads; the literal 4s here are not derived from SIZE.
dim3 grid_size(1, 1, 1);
dim3 block_size(4 * 4);
kernel<<<grid_size, block_size>>>(Xs_d, Ys_d, 4);
cudaMemcpy(Xs, Xs_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);
cudaMemcpy(Ys, Ys_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);
// The synchronize call is issued only after both copies have been requested.
cudaDeviceSynchronize();
// `row`/`col` are signed int while SIZE is unsigned, so the comparisons mix
// signedness.
for (int row = 0; row < SIZE; ++row) {
for (int col = 0; col < SIZE; ++col) {
std::cout << "[" << Xs[row * SIZE + col] << "|" << Ys[row * SIZE + col]
<< "] ";
}
std::cout << "\n";
}
cudaFree(Xs_d);
cudaFree(Ys_d);
free(Xs);
free(Ys);
}
results in
PS D:\samples\api\SGEMM_CUDA> nvcc -arch=sm_89 .\simplest_kernel.cu
simplest_kernel.cu
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: attribute "__global__" does not apply here
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: incomplete type "void" is not allowed
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "uint" is undefined
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "A" is undefined
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: identifier "B" is undefined
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: type name is not allowed
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: expected a ")"
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
^
D:\samples\api\SGEMM_CUDA\simplest_kernel.cu(5): error: expected a ";"
__declspec(__global__) void kernel(uint *A, uint *B, int row) {
Share
Improve this question
asked Nov 18, 2024 at 13:40
He HuangHe Huang
231 silver badge3 bronze badges
3
|
1 Answer
Reset to default
1
From your error message I can see that you are running on Windows.
uint is not a standard C++ type. On Linux it happens to be defined (as a typedef that the system headers pull in); on Windows with MSVC it is not.
You can fix this easily, by using unsigned
instead, or just define uint
as needed. The following fix will make your code compile and run correctly.
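Add the following definition of uint (note that unsigned always means unsigned int, which is 32 bits wide, i.e. the same size as uint32_t, on the platforms CUDA supports). The #ifndef guard only has an effect if uint is a preprocessor macro; where uint is already a typedef for unsigned int, as on Linux, repeating the identical typedef is still valid C++: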
#ifndef uint
typedef unsigned uint;
#endif
This problem manifests as it does because the nvcc preprocessor cannot translate the __global__, __device__, etc. directives in code with faulty syntax. If there are (too many) syntax errors, then the nvcc system will just feed the code into the C++ compiler as is, without processing the CUDA-specific extensions. That compiler will then fail on those extensions, resulting in the errors that you see.
In general when you see a lot of errors relating to CUDA stuff the odds are high that you have a simple syntax error somewhere tripping up the nvcc preprocessor.
The full code (see below) now compiles and runs correctly, outputting the correct result.
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#ifndef uint
typedef unsigned uint;
#endif
// Writes per-thread coordinates into two device arrays.
//
// Only threadIdx.x is read, so the kernel expects a 1-D block. Thread t is
// mapped to the pair (t / 4, t % 4); the first component is stored in A and
// the second in B, both at element offset (t / 4) * row + (t % 4).
//
// A, B : output arrays; must be valid for every offset the launch produces.
// row  : multiplier applied to the first component when forming the offset.
__global__ void kernel(uint* A, uint* B, int row) {
    const auto tileRow = threadIdx.x / 4;
    const auto tileCol = threadIdx.x % 4;
    // Both stores target the same offset, so compute it once.
    const auto offset = tileRow * row + tileCol;
    A[offset] = tileRow;
    B[offset] = tileCol;
}
int main(int argc, char** argv) {
uint* Xs, * Ys;
uint* Xs_d, * Ys_d;
uint SIZE = 4;
Xs = (uint*)malloc(SIZE * SIZE * sizeof(uint));
Ys = (uint*)malloc(SIZE * SIZE * sizeof(uint));
cudaMalloc((void**)&Xs_d, SIZE * SIZE * sizeof(uint));
cudaMalloc((void**)&Ys_d, SIZE * SIZE * sizeof(uint));
dim3 grid_size(1, 1, 1);
dim3 block_size(4 * 4);
kernel << <grid_size, block_size >> > (Xs_d, Ys_d, 4);
cudaMemcpy(Xs, Xs_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);
cudaMemcpy(Ys, Ys_d, SIZE * SIZE * sizeof(uint), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
for (int row = 0; row < SIZE; ++row) {
for (int col = 0; col < SIZE; ++col) {
std::cout << "[" << Xs[row * SIZE + col] << "|" << Ys[row * SIZE + col]
<< "] ";
}
std::cout << "\n";
}
cudaFree(Xs_d);
cudaFree(Ys_d);
free(Xs);
free(Ys);
}
uint
looks suspect; try replacing it with unsigned
or uint32_t
– Johan Commented Nov 19, 2024 at 7:55