cuda - Trying to end stream capture fails due to "unjoined work"; but synchronizing fails when capture is in p

I am capturing some work on a CUDA stream, then instantiating the resulting graph (or graph template rather) and running it again. I am running CUDA 12.6.85 with driver version 535.54.03 (the driver is from CUDA 12.2), and am getting an error:

If I try to end the stream capture after scheduling all operations of interest, using cuStreamEndCapture(), this fails, with the CUDA_ERROR_STREAM_CAPTURE_UNJOINED error.
If, instead, try to synchronize the stream before ending the capture (cuStreamSynchronize()), I get CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED.

I think this used to work for me, on a differeny system, with different CUDA and driver versions, so I suspect that may be the matter, but am not sure. What could be the cause of this happening?

I'll provide the code here, but note that it uses the CUDA API wrappers (so in principle you could say "It's a problem with your own library").

void cudaGraphsUsingStreamCapture(
    const cuda::device_t& device,
    span<float>  inputVec_h,
    span<float>  inputVec_d,
    span<double> outputVec_d,
    span<double> result_d)
{
    const char* graph_construction_mode = "stream capture";
    report_attempt("construction", graph_construction_mode);
    double result_h = 0.0;

    using cuda::stream::async;
    auto stream_1 = cuda::stream::create(device, async);
    auto stream_2 = cuda::stream::create(device, async);
    auto stream_3 = cuda::stream::create(device, async);

    auto fork_stream_event = cuda::event::create(device);
    auto reduce_output_memset_event = cuda::event::create(device);
    auto final_result_memset_event = cuda::event::create(device);

    stream_1.begin_capture();

    stream_1.enqueue.event(fork_stream_event);
    stream_2.enqueue.wait(fork_stream_event);
    stream_3.enqueue.wait(fork_stream_event);

    stream_1.enqueue.copy(inputVec_d, inputVec_h);
    stream_2.enqueue.memzero(outputVec_d);

    stream_2.enqueue.event();
    stream_3.enqueue.memzero(result_d);
    stream_3.enqueue.event(final_result_memset_event);

    stream_1.enqueue.wait(reduce_output_memset_event);

    auto launch_config = cuda::launch_config_builder()
        .grid_size(outputVec_d.size())
        .block_size(THREADS_PER_BLOCK)
        .build();

    stream_1.enqueue.kernel_launch(reduce, launch_config,
        inputVec_d.data(), outputVec_d.data(), inputVec_d.size(), outputVec_d.size());

    stream_1.enqueue.wait(final_result_memset_event);

    launch_config =  cuda::launch_config_builder()
        .grid_dimensions(1)
        .block_dimensions(THREADS_PER_BLOCK)
        .build();
    stream_1.enqueue.kernel_launch(reduceFinal, launch_config,
        outputVec_d.data(), result_d.data(), outputVec_d.size());

    stream_1.enqueue.copy(&result_h, result_d);

    auto callback = [&]() { myRealHostNodeCallback(graph_construction_mode, result_h); };
    stream_1.enqueue.host_invokable(callback);

    // If we add this:
    //
    // stream_1.synchronize();
    //
    // the program fails still, but with a different error.

    auto graph = stream_1.end_capture();
    use(device, graph, graph_construction_mode);
}

If I try to end the stream capture after scheduling all operations of interest, using cuStreamEndCapture(), this fails, with the CUDA_ERROR_STREAM_CAPTURE_UNJOINED error.
If, instead, try to synchronize the stream before ending the capture (cuStreamSynchronize()), I get CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED.

I think this used to work for me, on a differeny system, with different CUDA and driver versions, so I suspect that may be the matter, but am not sure. What could be the cause of this happening?

I'll provide the code here, but note that it uses the CUDA API wrappers (so in principle you could say "It's a problem with your own library").

void cudaGraphsUsingStreamCapture(
    const cuda::device_t& device,
    span<float>  inputVec_h,
    span<float>  inputVec_d,
    span<double> outputVec_d,
    span<double> result_d)
{
    const char* graph_construction_mode = "stream capture";
    report_attempt("construction", graph_construction_mode);
    double result_h = 0.0;

    using cuda::stream::async;
    auto stream_1 = cuda::stream::create(device, async);
    auto stream_2 = cuda::stream::create(device, async);
    auto stream_3 = cuda::stream::create(device, async);

    auto fork_stream_event = cuda::event::create(device);
    auto reduce_output_memset_event = cuda::event::create(device);
    auto final_result_memset_event = cuda::event::create(device);

    stream_1.begin_capture();

    stream_1.enqueue.event(fork_stream_event);
    stream_2.enqueue.wait(fork_stream_event);
    stream_3.enqueue.wait(fork_stream_event);

    stream_1.enqueue.copy(inputVec_d, inputVec_h);
    stream_2.enqueue.memzero(outputVec_d);

    stream_2.enqueue.event();
    stream_3.enqueue.memzero(result_d);
    stream_3.enqueue.event(final_result_memset_event);

    stream_1.enqueue.wait(reduce_output_memset_event);

    auto launch_config = cuda::launch_config_builder()
        .grid_size(outputVec_d.size())
        .block_size(THREADS_PER_BLOCK)
        .build();

    stream_1.enqueue.kernel_launch(reduce, launch_config,
        inputVec_d.data(), outputVec_d.data(), inputVec_d.size(), outputVec_d.size());

    stream_1.enqueue.wait(final_result_memset_event);

    launch_config =  cuda::launch_config_builder()
        .grid_dimensions(1)
        .block_dimensions(THREADS_PER_BLOCK)
        .build();
    stream_1.enqueue.kernel_launch(reduceFinal, launch_config,
        outputVec_d.data(), result_d.data(), outputVec_d.size());

    stream_1.enqueue.copy(&result_h, result_d);

    auto callback = [&]() { myRealHostNodeCallback(graph_construction_mode, result_h); };
    stream_1.enqueue.host_invokable(callback);

    // If we add this:
    //
    // stream_1.synchronize();
    //
    // the program fails still, but with a different error.

    auto graph = stream_1.end_capture();
    use(device, graph, graph_construction_mode);
}

Share Improve this question asked Mar 9 at 15:21 einpoklum 133k80 gold badges421 silver badges864 bronze badges

Add a comment |

1 Answer 1

Sorted by: Reset to default 0

tl;dr: You recorded an event but did not have any stream wait for it.

Your problem is with this line:

stream_2.enqueue.event();

it enqueues an event which stream_1 does not later wait for. That means that stream_2, which got involved in the graph by waiting for a captured event, was not "joined" back to stream_1 before the capture ended. This is what CUDA (or the NVIDIA driver) is complaining about, with the CUDA_ERROR_STREAM_CAPTURE_UNJOINED error.

You probably wanted to write:

stream_2.enqueue.event(reduce_output_memset_event);

as this line comes right after this one:

stream_2.enqueue.memzero(outputVec_d);

this way, you're waiting for the memzero() operation to complete before using the output vector, plus you're "joining" stream_2 back to stream_1, so that the graph (template) has both its source and sink vertices on stream_1.

科技改变生活-雨落星辰 - 所有的伟大,都源于一个勇敢的开始

cuda - Trying to end stream capture fails due to "unjoined work"; but synchronizing fails when capture is in p

1 Answer 1

tl;dr: You recorded an event but did not have any stream wait for it.

与本文相关的文章

评论列表(0)