
I'm consistently encountering an "invalid argument" error when calling cudaMemcpyBatchAsync for host-to-device transfers.

CUDA error at btest.cu:43 - invalid argument

Line 43 is the CUDA_CHECK(cudaMemcpyBatchAsync(...)) call. The API signature and description are from the official documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1gc02716b3bd21f3d83640ab102bf089f9

Code:

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

// CUDA error checking macro
#define CUDA_CHECK(call) \
   {\
   const cudaError_t err = call; \
   if (err != cudaSuccess) { \
     fprintf(stderr, "CUDA error at %s:%d - %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
     exit(EXIT_FAILURE); \
   } \
   }

int main() {
    // --- Configuration ---
    const int NUM_COPIES = 1000;
    const size_t COPY_SIZE_BYTES = 1024;

    // --- Allocate Host and Device Memory ---
    // We need multiple host and device pointers for individual copies
    std::vector<void*> h_src;
    std::vector<void*> d_dst;
    std::vector<size_t> sizes(NUM_COPIES, COPY_SIZE_BYTES);

    for (int i = 0; i < NUM_COPIES; ++i) {
        void* ptr_h, *ptr_d;
        CUDA_CHECK(cudaMallocHost(&ptr_h, COPY_SIZE_BYTES));
        CUDA_CHECK(cudaMalloc    (&ptr_d, COPY_SIZE_BYTES));
        h_src.push_back(ptr_h);
        d_dst.push_back(ptr_d);
    }

    std::vector<cudaMemcpyAttributes> attrs(1);
    attrs[0].srcLocHint.type = cudaMemLocationTypeHost;
    attrs[0].dstLocHint.type = cudaMemLocationTypeDevice;
    attrs[0].srcAccessOrder = cudaMemcpySrcAccessOrderAny;
    attrs[0].flags = 0;
    std::vector<size_t> attrsIdxs = {0};
    size_t numAttrs = attrs.size();

    size_t fail_idx=0; // Variable to store the index of the failed copy if any
    CUDA_CHECK(cudaMemcpyBatchAsync(
        d_dst.data(),
        h_src.data(),
        sizes.data(),
        NUM_COPIES,
        attrs.data(),
        attrsIdxs.data(),
        numAttrs,
        &fail_idx,
        0        // Default stream
    ));

    if( fail_idx!=SIZE_MAX ) throw std::runtime_error("Failed MemcpyBatchAsync at fail_idx = " + std::to_string(fail_idx) + "\n");

    // --- Cleanup ---
    for (int i = 0; i < NUM_COPIES; ++i) {
        CUDA_CHECK(cudaFreeHost(h_src[i]));
        CUDA_CHECK(cudaFree(d_dst[i]));
    }

    return 0;
}

Environment:

  • GPU: NVIDIA GeForce RTX 5090
  • Driver Version: 575.57.08
  • CUDA Version (from nvidia-smi): 12.9
  • Compilation: /usr/local/cuda-12.9/bin/nvcc -arch=sm_90 btest.cu -o a.out
  • Operating System: Ubuntu 24.04

Also tested on an RTX 3090 with CUDA 12.8, compiled for sm_86: same "invalid argument" error.

What else could cause an "invalid argument" error for cudaMemcpyBatchAsync in this scenario? Are there any subtle requirements or unusual environmental factors I might be missing?

You didn't initialize attrs[0].dstLocHint.id to the device number. Same with srcLocHint.id, but that one is ignored for cudaMemLocationTypeHost. (Commented Jul 1 at 6:02)
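For illustration, a hedged sketch of what this comment suggests, applied to the attribute setup from the question; the device ordinal 0 is an assumption for a single-GPU system, not something stated in the question or the docs:

// Fully initialize both location hints, including the device ordinal.
attrs[0].srcLocHint.type = cudaMemLocationTypeHost;    // .id is ignored for host locations
attrs[0].dstLocHint.type = cudaMemLocationTypeDevice;
attrs[0].dstLocHint.id   = 0;                          // ordinal of the GPU that owns d_dst (assumed 0)
attrs[0].srcAccessOrder  = cudaMemcpySrcAccessOrderAny;
attrs[0].flags           = 0;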

1 Answer

The likely problem is that the stream argument cannot be 0 (i.e. the default stream). You need to pass a named stream that was created with cudaStreamCreate*().

You also don't need to specify the location hints, because per the documentation: "The cudaMemcpyAttributes::srcLocHint and cudaMemcpyAttributes::dstLocHint allows applications to specify hint locations for operands of a copy when the operand doesn't have a fixed location. That is, these hints are only applicable for managed memory pointers on devices where cudaDevAttrConcurrentManagedAccess is true or system-allocated pageable memory on devices where cudaDevAttrPageableMemoryAccess is true."
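As a minimal sketch of this suggestion, here is how the attribute setup and the call from the question could look with a named stream and without the location hints. It reuses the question's variables (h_src, d_dst, sizes, NUM_COPIES, CUDA_CHECK) and leaves the hint fields zero-initialized by the vector; treat it as an untested outline rather than a verified fix.

// Create a named stream instead of using the default stream.
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));

// Only the access order and flags are set; the location hints are omitted
// because the operands (cudaMallocHost / cudaMalloc allocations) already
// have fixed locations.
std::vector<cudaMemcpyAttributes> attrs(1);
attrs[0].srcAccessOrder = cudaMemcpySrcAccessOrderAny;
attrs[0].flags = 0;
std::vector<size_t> attrsIdxs = {0};

size_t fail_idx = 0;
CUDA_CHECK(cudaMemcpyBatchAsync(
    d_dst.data(),
    h_src.data(),
    sizes.data(),
    NUM_COPIES,
    attrs.data(),
    attrsIdxs.data(),
    attrs.size(),
    &fail_idx,
    stream));          // named stream instead of 0

CUDA_CHECK(cudaStreamSynchronize(stream));
CUDA_CHECK(cudaStreamDestroy(stream));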


1 Comment

Well spotted.
