I allocated and initialized two arrays (`A` and `B`) in global memory using `cudaMalloc()` and `cudaMemset()`, then launched a kernel on `A`. The kernel does nothing with array `B`, but when I compared a copy of `B` taken right after the kernel executed against a copy of `B` taken before the kernel was called, there was a huge difference. What happened here?
/**
 * Sorts one segment of A per thread block, in place and in ascending order,
 * using cub::BlockRadixSort.
 *
 * Each block handles the segment that starts at RATIO * <linear block index>
 * and sorts 256 * 8 = 2048 consecutive ints from that position.
 *
 * Launch requirements:
 *   - The block must be one-dimensional with exactly 256 threads.
 *     cub::BlockRadixSort<int, 256, 8> is instantiated for BLOCK_DIM_X = 256
 *     with the Y/Z block dimensions left at their defaults; a 2-D or 3-D block
 *     needs the extra BLOCK_DIM_Y / BLOCK_DIM_Z template arguments.
 *   - The grid may be 1-D or 2-D; blocks are linearised row-major.
 *   - A must be a device pointer with at least
 *     RATIO * (number of blocks - 1) + 2048 elements; anything smaller makes
 *     this kernel read and write past the end of A (and into whatever
 *     allocation happens to follow it).
 *
 * NOTE(review): RATIO is defined outside this snippet. Segments only stay
 * disjoint if RATIO >= 2048 -- confirm against its definition.
 */
__global__ void BlockSort(int *A)
{
    constexpr int BLOCK_THREADS    = 256;
    constexpr int ITEMS_PER_THREAD = 8;

    using BlockRadixSort = cub::BlockRadixSort<int, BLOCK_THREADS, ITEMS_PER_THREAD>;
    __shared__ typename BlockRadixSort::TempStorage temp_storage;

    int threadKeys[ITEMS_PER_THREAD];

    // Row-major linear block index: a full row of blocks is gridDim.x wide.
    const size_t seg_idx   = static_cast<size_t>(gridDim.x) * blockIdx.y + blockIdx.x;
    const size_t seg_start = static_cast<size_t>(RATIO) * seg_idx;

    // Row-major linear thread index inside the block (equals threadIdx.x for
    // the required 1-D block).
    const size_t tid = static_cast<size_t>(blockDim.x) * threadIdx.y + threadIdx.x;

    // Blocked arrangement: thread t owns elements [8*t, 8*t + 8) of the segment.
    const size_t thread_start = seg_start + ITEMS_PER_THREAD * tid;

    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        threadKeys[i] = A[thread_start + i];

    // Collective call: every thread of the block must reach it.
    BlockRadixSort(temp_storage).Sort(threadKeys);

    // Keys come back in blocked arrangement, so writing them to the same
    // per-thread slots yields a fully sorted segment.
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        A[thread_start + i] = threadKeys[i];
}
// Report any CUDA runtime failure immediately instead of silently carrying on
// with stale or corrupted data.
auto checkCuda = [](cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
};

// Host snapshot of B taken before the kernel runs.
int* B1 = (int*)malloc(sizeof(int) * sz);
if (B1 == NULL) {
    fprintf(stderr, "malloc failed for B1\n");
    exit(EXIT_FAILURE);
}
checkCuda(cudaMemcpy(B1, B, sizeof(int) * sz, cudaMemcpyDeviceToHost),
          "copy of B before launch");

BlockSort<<<gridDim, blockDim>>>(A); // launch BlockSort with A. B is not passed to the kernel
// A launch does not return an error code itself; invalid launch
// configurations are only visible through cudaGetLastError().
checkCuda(cudaGetLastError(), "BlockSort launch");
// The blocking cudaMemcpy below would already wait for the kernel on the
// default stream, so this sync is not needed for ordering. It is kept so that
// errors raised while the kernel executes (e.g. an illegal address) are
// reported here, attributed to the kernel rather than to the copy.
checkCuda(cudaDeviceSynchronize(), "BlockSort execution");

// Host snapshot of B taken after the kernel has finished.
int* B2 = (int*)malloc(sizeof(int) * sz);
if (B2 == NULL) {
    fprintf(stderr, "malloc failed for B2\n");
    exit(EXIT_FAILURE);
}
checkCuda(cudaMemcpy(B2, B, sizeof(int) * sz, cudaMemcpyDeviceToHost),
          "copy of B after launch");

// Any mismatch means device memory belonging to B changed across the launch.
for (int i = 0; i < sz; ++i)
{
    if (B1[i] != B2[i])
        printf("at i = %d, B1[%d] = %d, B2[%d] = %d\n",i, i, B1[i], i, B2[i]);
}
The print messages show that `B1` and `B2` are different at many different indices `i`.
If I comment out the line launching my `BlockSort()` kernel, then `B1` is the same as `B2`.
Honestly at a loss here. Any help will be appreciated!