3

I have a simple function gpu_allocate() that helps allocate memory on the GPU (CUDA):

template <typename T> T *gpu_allocate() {
  T *data;
  cudaMallocManaged(&data, sizeof(T));

  return data;
}

// Example payload type: a circle characterized only by its radius.
struct Circle {
  double radius;
  Circle(double r) : radius(r) {}
};

// Example payload type with a two-argument constructor.
struct Quad {
  double x, y;
  Quad(double px, double py) : x(px), y(py) {}
};

int main() {
  auto circle = gpu_allocate<Circle>();
  *circle = Circle(0.3);

  auto quad = gpu_allocate<Quad>();
  *quad = Quad(0.3, 0.2);

  return 0;
}

After allocating data, I usually have to initialize this newly created object with a constructor. Is there a way to combine these constructors (with different types and numbers of parameters) into the allocating function, so I can have:

// (Pseudo-code from the question.) "Parameters" stands in for an arbitrary
// argument list; the real mechanism is a variadic template (parameter pack).
// NOTE(review): `*data = T(parameters)` assigns through a pointer to raw,
// unconstructed memory -- undefined behavior for non-trivial T; placement-new
// is the correct tool (see the accepted answer below).
template <typename T> T *gpu_allocate(Parameters parameters) {
  T *data;
  cudaMallocManaged(&data, sizeof(T));
  *data = T(parameters);

  return data;
}

auto circle = gpu_allocate<Circle>(0.3);
auto quad = gpu_allocate<Quad>(0.1, 0.2);

Is this possible?

5
  • 2
    Your pseudo-code is a bit too pseudo, but sure. Does Parameters have to be a hardcoded type? Is that a CUDA thing? Otherwise, use a parameter pack. template<class... Params> T* gpu_allocate(Params&&... params) { T *data; cudaMallocManaged(&data, sizeof *data); return new(static_cast<void*>(data)) T(std::forward<Params>(params)...); } (untested) Commented Aug 9 at 0:25
  • 2
    For more context: You want "placement new". It will construct an object in raw memory. What you are doing is constructing an object and assigning it which can fail for more complex classes. Commented Aug 9 at 0:27
  • @TedLyngmo yes parameter pack is basically what I was looking for but I didn't know how to describe it. Could you provide a working example? Commented Aug 9 at 0:46
  • 1
    You could also go with what is already there in the CUDA Toolkit and use thrust::universal_vector. Commented Aug 9 at 0:55
  • The idiomatic way is to overload operator new. Commented Aug 9 at 4:47

1 Answer 1

6

It looks like you need a function that takes a parameter pack and then uses placement-new in the allocated memory.

You should however not return a raw pointer to the allocated memory. Return a unique_ptr that will call the destructor on the object and free the allocated memory when it goes out of scope (if it still owns the pointer at that time).

Example of what the unique_ptr could look like:

#include <memory>
#include <new>          // std::bad_alloc, placement new
#include <type_traits>
#include <utility>

// unique_ptr deleter for objects placement-new'ed into memory that came from
// cudaMallocManaged: runs the destructor, then returns the storage to CUDA.
template <class T>
struct CudaDeleter {
    void operator()(T* ptr) const {
        std::destroy_at(ptr);  // equivalent to ptr->~T()
        cudaFree(ptr);
    }
};

// Convenience alias: a unique_ptr that, via CudaDeleter, destroys the object
// and frees the CUDA managed allocation when it goes out of scope.
template <class T, class Deleter = CudaDeleter<T>>
using cuda_unique_ptr = std::unique_ptr<T, Deleter>;

And then a possible implementation of the function, here renamed cuda_make_unique:

// Allocates CUDA managed storage for one T, constructs it in place from the
// forwarded arguments, and returns it wrapped in a cuda_unique_ptr.
// Throws std::bad_alloc when the allocation fails; if T's constructor throws,
// the storage is freed before the exception propagates. Disabled for arrays.
template <class T, class... Args>
std::enable_if_t<!std::is_array_v<T>, cuda_unique_ptr<T>>
cuda_make_unique(Args&&... args) {
    void* raw = nullptr;

    if (cudaMallocManaged(&raw, sizeof(T)) != cudaError_t::cudaSuccess) {
        throw std::bad_alloc();
    }

    // Placement-new constructs T inside the managed storage; from here on the
    // cuda_unique_ptr owns both the object and the allocation.
    try {
        T* obj = new (raw) T(std::forward<Args>(args)...);
        return cuda_unique_ptr<T>(obj);
    } catch (...) {
        cudaFree(raw);
        throw;
    }
}

If you don't want the try-catch-rethrow pattern around the potentially throwing new, you could wrap the raw pointer in a helper class that frees the memory in case new throws an exception:

// Variant of cuda_make_unique without an explicit try/catch: a local RAII
// guard owns the raw allocation until placement-new has succeeded.
template <class T, class... Params>
std::enable_if_t<!std::is_array_v<T>, cuda_unique_ptr<T>>
cuda_make_unique(Params&&... params) {
    // RAII guard: acquires the managed allocation in its constructor and
    // frees it in its destructor unless ownership was handed off (m_ptr
    // reset to nullptr) by create().
    struct in_case_new_throws {
        in_case_new_throws() {
            cudaError_t res = cudaMallocManaged(&m_ptr, sizeof(T));
            if (res != cudaError_t::cudaSuccess) throw std::bad_alloc();
        }
        ~in_case_new_throws() {
            // if m_ptr is not nullptr, then new threw an exception
            // and we need to free the allocated memory
            if (m_ptr) cudaFree(m_ptr);
        }
        // Constructs T in the allocated storage; on success, ownership moves
        // to the returned cuda_unique_ptr and the guard is disarmed.
        cuda_unique_ptr<T> create(Params&&... ps) {
            T* obj = new (m_ptr) T(std::forward<Params>(ps)...);
            m_ptr = nullptr; // new didn't throw
            return cuda_unique_ptr<T>(obj);
        }    
        void* m_ptr; // raw managed allocation; nullptr once ownership moved
    } storage;
    
    return storage.create(std::forward<Params>(params)...);
}

Usage:

// Example type with a multi-argument constructor (the arguments are unused);
// demonstrates forwarding through cuda_make_unique.
struct Foo {
    Foo(int, int, int) {}
};

int main() {
    // The deduced type is cuda_unique_ptr<Foo>, i.e.
    // std::unique_ptr<Foo, CudaDeleter<Foo>>:
    auto objPtr = cuda_make_unique<Foo>(1, 2, 3); 

    // use objPtr here ...

} // the created Foo is destroyed automatically here
Sign up to request clarification or add additional context in comments.

2 Comments

Thanks for your answer! Is it possible to build a similar cuda_shared_ptr based on std::shared_ptr?
@Rahn Without thinking about it too long, I'd say yes. Why would you want a shared version though?

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.