I want to implement CDP for a basic forward function (I will call the forward function too many times at the same time (also from a CUDA function) and because of that I want to use CDP)
Here's the code that I'm trying to run;
__device__ void NNFeedForwardNormalMultiple(double* __restrict__ values, double* __restrict__ weigths, double* result, int inputsize, int outputsize) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int outputidx = idx / outputsize;
int inputidx = idx % outputsize;
if (outputidx >= outputsize || inputidx >= inputsize) {
return;
}
atomicAdd(&result[outputidx], values[inputidx] * weigths[outputsize*outputidx + inputidx]);
}
__device__ void NNFeedForwardNormalActivate(double* __restrict__ biases, double* result, int size) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= size) {
return;
}
result[idx] = 1.0 / (1.0 + exp(-(result[idx] + biases[idx])));
}
__global__ void NNFeedForwardNormal(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
int blocksize = (inputsize * outputsize + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
NNFeedForwardNormalMultiple<<<blocksize, THREADS_PER_BLOCK>>>(values, weigths, result, inputsize, outputsize);
cudaDeviceSynchronize();
NNFeedForwardNormalActivate<<<(outputsize + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(biases, result, outputsize);
}
I also tried to run the function from a device function like this but still gave me the same error;
__device__ void NNFeedForwardNormalMultiple(double* __restrict__ values, double* __restrict__ weigths, double* result, int inputsize, int outputsize) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int outputidx = idx / outputsize;
int inputidx = idx % outputsize;
if (outputidx >= outputsize || inputidx >= inputsize) {
return;
}
atomicAdd(&result[outputidx], values[inputidx] * weigths[outputsize*outputidx + inputidx]);
}
__device__ void NNFeedForwardNormalActivate(double* __restrict__ biases, double* result, int size) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= size) {
return;
}
result[idx] = 1.0 / (1.0 + exp(-(result[idx] + biases[idx])));
}
__device__ void NNFeedForwardNormal(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
int blocksize = (inputsize * outputsize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
NNFeedForwardNormalMultiple<<<blocksize, THREADS_PER_BLOCK>>>(values, weigths, result, inputsize, outputsize);
NNFeedForwardNormalActivate<<<(outputsize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(biases, result, outputsize);
}
__global__ void NNFeedForwardNormalWrapper(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
NNFeedForwardNormal(values, weigths, result, biases, inputsize, outputsize);
}
And also tried cudaLaunchKernel
function and using __global__
instead of __device__
but they didn't work either. I'm using -rdc=true
flag too and also my arch is sm_75
which should support CDP.
I want to implement CDP for a basic forward function (I will call the forward function too many times at the same time (also from a CUDA function) and because of that I want to use CDP)
Here's the code that I'm trying to run;
__device__ void NNFeedForwardNormalMultiple(double* __restrict__ values, double* __restrict__ weigths, double* result, int inputsize, int outputsize) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int outputidx = idx / outputsize;
int inputidx = idx % outputsize;
if (outputidx >= outputsize || inputidx >= inputsize) {
return;
}
atomicAdd(&result[outputidx], values[inputidx] * weigths[outputsize*outputidx + inputidx]);
}
__device__ void NNFeedForwardNormalActivate(double* __restrict__ biases, double* result, int size) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= size) {
return;
}
result[idx] = 1.0 / (1.0 + exp(-(result[idx] + biases[idx])));
}
__global__ void NNFeedForwardNormal(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
int blocksize = (inputsize * outputsize + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
NNFeedForwardNormalMultiple<<<blocksize, THREADS_PER_BLOCK>>>(values, weigths, result, inputsize, outputsize);
cudaDeviceSynchronize();
NNFeedForwardNormalActivate<<<(outputsize + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(biases, result, outputsize);
}
I also tried to run the function from a device function like this but still gave me the same error;
__device__ void NNFeedForwardNormalMultiple(double* __restrict__ values, double* __restrict__ weigths, double* result, int inputsize, int outputsize) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int outputidx = idx / outputsize;
int inputidx = idx % outputsize;
if (outputidx >= outputsize || inputidx >= inputsize) {
return;
}
atomicAdd(&result[outputidx], values[inputidx] * weigths[outputsize*outputidx + inputidx]);
}
__device__ void NNFeedForwardNormalActivate(double* __restrict__ biases, double* result, int size) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= size) {
return;
}
result[idx] = 1.0 / (1.0 + exp(-(result[idx] + biases[idx])));
}
__device__ void NNFeedForwardNormal(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
int blocksize = (inputsize * outputsize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
NNFeedForwardNormalMultiple<<<blocksize, THREADS_PER_BLOCK>>>(values, weigths, result, inputsize, outputsize);
NNFeedForwardNormalActivate<<<(outputsize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(biases, result, outputsize);
}
__global__ void NNFeedForwardNormalWrapper(double* __restrict__ values, double* __restrict__ weigths, double* result, double* __restrict__ biases, int inputsize, int outputsize) {
NNFeedForwardNormal(values, weigths, result, biases, inputsize, outputsize);
}
And also tried cudaLaunchKernel
function and using __global__
instead of __device__
but they didn't work either. I'm using -rdc=true
flag too and also my arch is sm_75
which should support CDP.
1 Answer
Reset to default 1The error is exactly what is says on the tin.
If you want to configure a call to a function with <<<x,y,z>>>
parameters, it needs to be a __global__
function.
It is perfectly valid to call a __global__
function from another global function.
if you want your code to compile, recode it like so:
#define restrict __restrict__
__global__ void NNFeedForwardNormalMultiple(double* restrict values, double* restrict weights, double* result, int inputsize, int outputsize) {
...
}
__global__ void NNFeedForwardNormalActivate(double* restrict biases, double* result, int size) {
...
}
__global__ void NNFeedForwardNormal(double* restrict values, double* restrict weights, double* result, double* restrict biases, int inputsize, int outputsize) {
...
}
__global__ void NNFeedForwardNormalWrapper(double* restrict values, double* restrict weights, double* result, double* restrict biases, int inputsize, int outputsize) {
const auto blocks = some calculation;
const auto threads_per_block = some calculation;
NNFeedForwardNormal<<<blocks, threads_per_block>>>(values, weights, result, biases, inputsize, outputsize);
}
If instead you want to limit the number of threads in a device function, you can do it like below. Note that you can only limit the number of blocks/threads, you cannot expand it.
As long as you make sure to keep the bounds at warp edges, this will incur no slowdown.
__device__ void first_ten_blocks() { ... }
__device__ void other_blocks() { ... }
__global__ void start() {
if (blockIdx.x < 10) { first_ten_blocks(); }
else { other_blocks(); }
}
__device__ void first_warp() { ... }
__device__ void other_warps() { ... }
__global__ void warp_split() {
if (threadIdx.x < 32) { first_warp(); }
else { other_warps(); }
}
int main() {
start<<<100,32>>>(); //10 block first_ten_block, 90 blocks other blocks
start<<<9,32>>>(); //9 blocks first_ten_blocks, no other blocks
warp_split<<<1, 64>>>(); 1 warp in first_warp, 1 in other_warps
warp_split<<<10, 32>>>(); 10x warp in first_warp, no other warps
warp_split<<<48, 512>>>(); 48x first_warp, 48x other warps with 512-32 = 480 threads each.
}
__global__
would be more interesting. – paleonix Commented Mar 6 at 12:02cudaDeviceSynchronize()
in device code will not run on recent GPUs. It was replaced with CDP2 which does not allow using results from child kernels in the parent kernel. – paleonix Commented Mar 6 at 14:41