I am capturing some work on a CUDA stream, then instantiating the resulting graph (or graph template rather) and running it again. I am running CUDA 12.6.85 with driver version 535.54.03 (the driver is from CUDA 12.2), and am getting an error:
- If I try to end the stream capture after scheduling all operations of interest, using
cuStreamEndCapture()
, this fails, with theCUDA_ERROR_STREAM_CAPTURE_UNJOINED
error. - If, instead, try to synchronize the stream before ending the capture (
cuStreamSynchronize()
), I getCUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
.
I think this used to work for me, on a differeny system, with different CUDA and driver versions, so I suspect that may be the matter, but am not sure. What could be the cause of this happening?
I'll provide the code here, but note that it uses the CUDA API wrappers (so in principle you could say "It's a problem with your own library").
void cudaGraphsUsingStreamCapture(
const cuda::device_t& device,
span<float> inputVec_h,
span<float> inputVec_d,
span<double> outputVec_d,
span<double> result_d)
{
const char* graph_construction_mode = "stream capture";
report_attempt("construction", graph_construction_mode);
double result_h = 0.0;
using cuda::stream::async;
auto stream_1 = cuda::stream::create(device, async);
auto stream_2 = cuda::stream::create(device, async);
auto stream_3 = cuda::stream::create(device, async);
auto fork_stream_event = cuda::event::create(device);
auto reduce_output_memset_event = cuda::event::create(device);
auto final_result_memset_event = cuda::event::create(device);
stream_1.begin_capture();
stream_1.enqueue.event(fork_stream_event);
stream_2.enqueue.wait(fork_stream_event);
stream_3.enqueue.wait(fork_stream_event);
stream_1.enqueue.copy(inputVec_d, inputVec_h);
stream_2.enqueue.memzero(outputVec_d);
stream_2.enqueue.event();
stream_3.enqueue.memzero(result_d);
stream_3.enqueue.event(final_result_memset_event);
stream_1.enqueue.wait(reduce_output_memset_event);
auto launch_config = cuda::launch_config_builder()
.grid_size(outputVec_d.size())
.block_size(THREADS_PER_BLOCK)
.build();
stream_1.enqueue.kernel_launch(reduce, launch_config,
inputVec_d.data(), outputVec_d.data(), inputVec_d.size(), outputVec_d.size());
stream_1.enqueue.wait(final_result_memset_event);
launch_config = cuda::launch_config_builder()
.grid_dimensions(1)
.block_dimensions(THREADS_PER_BLOCK)
.build();
stream_1.enqueue.kernel_launch(reduceFinal, launch_config,
outputVec_d.data(), result_d.data(), outputVec_d.size());
stream_1.enqueue.copy(&result_h, result_d);
auto callback = [&]() { myRealHostNodeCallback(graph_construction_mode, result_h); };
stream_1.enqueue.host_invokable(callback);
// If we add this:
//
// stream_1.synchronize();
//
// the program fails still, but with a different error.
auto graph = stream_1.end_capture();
use(device, graph, graph_construction_mode);
}
I am capturing some work on a CUDA stream, then instantiating the resulting graph (or graph template rather) and running it again. I am running CUDA 12.6.85 with driver version 535.54.03 (the driver is from CUDA 12.2), and am getting an error:
- If I try to end the stream capture after scheduling all operations of interest, using
cuStreamEndCapture()
, this fails, with theCUDA_ERROR_STREAM_CAPTURE_UNJOINED
error. - If, instead, try to synchronize the stream before ending the capture (
cuStreamSynchronize()
), I getCUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
.
I think this used to work for me, on a differeny system, with different CUDA and driver versions, so I suspect that may be the matter, but am not sure. What could be the cause of this happening?
I'll provide the code here, but note that it uses the CUDA API wrappers (so in principle you could say "It's a problem with your own library").
void cudaGraphsUsingStreamCapture(
const cuda::device_t& device,
span<float> inputVec_h,
span<float> inputVec_d,
span<double> outputVec_d,
span<double> result_d)
{
const char* graph_construction_mode = "stream capture";
report_attempt("construction", graph_construction_mode);
double result_h = 0.0;
using cuda::stream::async;
auto stream_1 = cuda::stream::create(device, async);
auto stream_2 = cuda::stream::create(device, async);
auto stream_3 = cuda::stream::create(device, async);
auto fork_stream_event = cuda::event::create(device);
auto reduce_output_memset_event = cuda::event::create(device);
auto final_result_memset_event = cuda::event::create(device);
stream_1.begin_capture();
stream_1.enqueue.event(fork_stream_event);
stream_2.enqueue.wait(fork_stream_event);
stream_3.enqueue.wait(fork_stream_event);
stream_1.enqueue.copy(inputVec_d, inputVec_h);
stream_2.enqueue.memzero(outputVec_d);
stream_2.enqueue.event();
stream_3.enqueue.memzero(result_d);
stream_3.enqueue.event(final_result_memset_event);
stream_1.enqueue.wait(reduce_output_memset_event);
auto launch_config = cuda::launch_config_builder()
.grid_size(outputVec_d.size())
.block_size(THREADS_PER_BLOCK)
.build();
stream_1.enqueue.kernel_launch(reduce, launch_config,
inputVec_d.data(), outputVec_d.data(), inputVec_d.size(), outputVec_d.size());
stream_1.enqueue.wait(final_result_memset_event);
launch_config = cuda::launch_config_builder()
.grid_dimensions(1)
.block_dimensions(THREADS_PER_BLOCK)
.build();
stream_1.enqueue.kernel_launch(reduceFinal, launch_config,
outputVec_d.data(), result_d.data(), outputVec_d.size());
stream_1.enqueue.copy(&result_h, result_d);
auto callback = [&]() { myRealHostNodeCallback(graph_construction_mode, result_h); };
stream_1.enqueue.host_invokable(callback);
// If we add this:
//
// stream_1.synchronize();
//
// the program fails still, but with a different error.
auto graph = stream_1.end_capture();
use(device, graph, graph_construction_mode);
}
Share
Improve this question
asked Mar 9 at 15:21
einpoklumeinpoklum
133k80 gold badges421 silver badges864 bronze badges
1 Answer
Reset to default 0tl;dr: You recorded an event but did not have any stream wait for it.
Your problem is with this line:
stream_2.enqueue.event();
it enqueues an event which stream_1 does not later wait for. That means that stream_2, which got involved in the graph by waiting for a captured event, was not "joined" back to stream_1 before the capture ended. This is what CUDA (or the NVIDIA driver) is complaining about, with the CUDA_ERROR_STREAM_CAPTURE_UNJOINED
error.
You probably wanted to write:
stream_2.enqueue.event(reduce_output_memset_event);
as this line comes right after this one:
stream_2.enqueue.memzero(outputVec_d);
this way, you're waiting for the memzero()
operation to complete before using the output vector, plus you're "joining" stream_2 back to stream_1, so that the graph (template) has both its source and sink vertices on stream_1.