19#include <cub/device/device_scan.cuh>
20#include <cub/device/device_select.cuh>
21#define MFEM_CUB_NAMESPACE cub
22#elif defined(MFEM_USE_HIP)
23#include <hipcub/device/device_scan.hpp>
24#include <hipcub/device/device_select.hpp>
25#define MFEM_CUB_NAMESPACE hipcub
37template <
class InputIt,
class OutputIt>
38void InclusiveScan(
bool use_dev, InputIt d_in, OutputIt d_out,
size_t num_items)
41#if defined(MFEM_USE_CUDA) || defined(MFEM_USE_HIP)
45 size_t bytes = workspace.
Size();
48 auto err = MFEM_CUB_NAMESPACE::DeviceScan::InclusiveSum(
49 workspace.
Write(), bytes, d_in, d_out, num_items);
50#if defined(MFEM_USE_CUDA)
51 if (
err == cudaSuccess)
55#elif defined(MFEM_USE_HIP)
56 if (
err == hipSuccess)
64 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceScan::InclusiveSum(
65 nullptr, bytes, d_in, d_out, num_items));
67 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceScan::InclusiveSum(
68 workspace.
Write(), bytes, d_in, d_out, num_items));
73 std::inclusive_scan(d_in, d_in + num_items, d_out);
82 for (
size_t i = 1; i < num_items; ++i)
84 *d_out = (*prev) + (*d_in);
101template <
class InputIt,
class OutputIt,
class ScanOp>
102void InclusiveScan(
bool use_dev, InputIt d_in, OutputIt d_out,
size_t num_items,
105#if defined(MFEM_USE_CUDA) || defined(MFEM_USE_HIP)
109 size_t bytes = workspace.
Size();
112 auto err = MFEM_CUB_NAMESPACE::DeviceScan::InclusiveScan(
113 workspace.
Write(), bytes, d_in, d_out, scan_op, num_items);
114#if defined(MFEM_USE_CUDA)
115 if (
err == cudaSuccess)
119#elif defined(MFEM_USE_HIP)
120 if (
err == hipSuccess)
128 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceScan::InclusiveScan(
129 nullptr, bytes, d_in, d_out, scan_op, num_items));
131 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceScan::InclusiveScan(
132 workspace.
Write(), bytes, d_in, d_out, scan_op, num_items));
137 std::inclusive_scan(d_in, d_in + num_items, d_out, scan_op);
146 for (
size_t i = 1; i < num_items; ++i)
148 *d_out = scan_op(*prev, *d_in);
164template <
class InputIt,
class OutputIt,
class T,
class ScanOp>
165void ExclusiveScan(
bool use_dev, InputIt d_in, OutputIt d_out,
size_t num_items,
166 T init_value, ScanOp scan_op)
168#if defined(MFEM_USE_CUDA) || defined(MFEM_USE_HIP)
172 size_t bytes = workspace.
Size();
175 auto err = MFEM_CUB_NAMESPACE::DeviceScan::ExclusiveScan(
176 workspace.
Write(), bytes, d_in, d_out, scan_op, init_value,
178#if defined(MFEM_USE_CUDA)
179 if (
err == cudaSuccess)
183#elif defined(MFEM_USE_HIP)
184 if (
err == hipSuccess)
192 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceScan::ExclusiveScan(
193 nullptr, bytes, d_in, d_out, scan_op, init_value, num_items));
195 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceScan::ExclusiveScan(
196 workspace.
Write(), bytes, d_in, d_out, scan_op, init_value,
202 std::exclusive_scan(d_in, d_in + num_items, d_out, init_value, scan_op);
207 for (
size_t i = 0; i < num_items; ++i)
209 auto next = scan_op(init_value, *d_in);
221template <
class InputIt,
class OutputIt,
class T>
222void ExclusiveScan(
bool use_dev, InputIt d_in, OutputIt d_out,
size_t num_items,
225 ExclusiveScan(use_dev, d_in, d_out, num_items, init_value, std::plus<> {});
237template <
class InputIt,
class FlagIt,
class OutputIt,
class NumSelectedIt>
238void CopyFlagged(
bool use_dev, InputIt d_in, FlagIt d_flags, OutputIt d_out,
239 NumSelectedIt d_num_selected_out,
size_t num_items)
241#if defined(MFEM_USE_CUDA) || defined(MFEM_USE_HIP)
246 size_t bytes = workspace.
Size();
249 auto err = MFEM_CUB_NAMESPACE::DeviceSelect::Flagged(
250 workspace.
Write(), bytes, d_in, d_flags, d_out, d_num_selected_out,
252#if defined(MFEM_USE_CUDA)
253 if (
err == cudaSuccess)
257#elif defined(MFEM_USE_HIP)
258 if (
err == hipSuccess)
266 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceSelect::Flagged(
267 nullptr, bytes, d_in, d_flags, d_out, d_num_selected_out, num_items));
269 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceSelect::Flagged(
270 workspace.
Write(), bytes, d_in, d_flags, d_out, d_num_selected_out,
275 *d_num_selected_out = 0;
276 for (
size_t i = 0; i < num_items; ++i, ++d_in, ++d_flags)
282 ++*d_num_selected_out;
294template <
class InputIt,
class OutputIt,
class NumSelectedIt,
class SelectOp>
295void CopyIf(
bool use_dev, InputIt d_in, OutputIt d_out,
296 NumSelectedIt d_num_selected_out,
size_t num_items,
299#if defined(MFEM_USE_CUDA) || defined(MFEM_USE_HIP)
303#if defined(MFEM_USE_CUDA) && \
304 (__CUDACC_VER_MAJOR__ < 12 || \
305 (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 5))
308 auto ptr = flags.
Write();
310 [=] MFEM_HOST_DEVICE(
int i) { ptr[i] = select_op(d_in[i]); });
311 CopyFlagged(use_dev, d_in, ptr, d_out, d_num_selected_out, num_items);
314 size_t bytes = workspace.
Size();
317 auto err = MFEM_CUB_NAMESPACE::DeviceSelect::If(
318 workspace.
Write(), bytes, d_in, d_out, d_num_selected_out,
319 num_items, select_op);
320#if defined(MFEM_USE_CUDA)
321 if (
err == cudaSuccess)
325#elif defined(MFEM_USE_HIP)
326 if (
err == hipSuccess)
334 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceSelect::If(
335 nullptr, bytes, d_in, d_out, d_num_selected_out, num_items,
338 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceSelect::If(
339 workspace.
Write(), bytes, d_in, d_out, d_num_selected_out, num_items,
345 *d_num_selected_out = 0;
346 for (
size_t i = 0; i < num_items; ++i, ++d_in)
348 if (select_op(*d_in))
352 ++*d_num_selected_out;
364template <
class InputIt,
class OutputIt,
class NumSelectedIt>
366 NumSelectedIt d_num_selected_out,
size_t num_items)
368#if defined(MFEM_USE_CUDA) || defined(MFEM_USE_HIP)
373 size_t bytes = workspace.
Size();
376 auto err = MFEM_CUB_NAMESPACE::DeviceSelect::Unique(
377 workspace.
Write(), bytes, d_in, d_out, d_num_selected_out,
379#if defined(MFEM_USE_CUDA)
380 if (
err == cudaSuccess)
384#elif defined(MFEM_USE_HIP)
385 if (
err == hipSuccess)
393 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceSelect::Unique(
394 nullptr, bytes, d_in, d_out, d_num_selected_out, num_items));
396 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceSelect::Unique(
397 workspace.
Write(), bytes, d_in, d_out, d_num_selected_out,
402 *d_num_selected_out =
403 std::unique_copy(d_in, d_in + num_items, d_out) - d_out;
407#undef MFEM_CUB_NAMESPACE
void SetSize(int nsize)
Change the logical size of the array, keep existing entries.
int Size() const
Return the logical size of the array.
T * Write(bool on_dev=true)
Shortcut for mfem::Write(a.GetMemory(), a.Size(), on_dev).
static bool Allows(unsigned long b_mask)
Return true if any of the backends in the backend mask, b_mask, are allowed.
void ExclusiveScan(bool use_dev, InputIt d_in, OutputIt d_out, size_t num_items, T init_value, ScanOp scan_op)
void CopyFlagged(bool use_dev, InputIt d_in, FlagIt d_flags, OutputIt d_out, NumSelectedIt d_num_selected_out, size_t num_items)
Equivalent to *d_num_selected_out = std::copy_if(d_in, d_in+num_items, d_out, [=](auto iter){ return ...
OutStream err(std::cerr)
Global stream used by the library for standard error output. Initially it uses the same std::streambu...
void CopyUnique(bool use_dev, InputIt d_in, OutputIt d_out, NumSelectedIt d_num_selected_out, size_t num_items)
Equivalent to *d_num_selected_out = std::unique_copy(d_in, d_in+num_items, d_out) - d_out;
void InclusiveScan(bool use_dev, InputIt d_in, OutputIt d_out, size_t num_items)
void CopyIf(bool use_dev, InputIt d_in, OutputIt d_out, NumSelectedIt d_num_selected_out, size_t num_items, SelectOp select_op)
Equivalent to *d_num_selected_out = std::copy_if(d_in, d_in+num_items, d_out, select_op) - d_out;.
void forall(int N, lambda &&body)
@ HIP_MASK
Bitwise-OR of all HIP backends.
@ CUDA_MASK
Bitwise-OR of all CUDA backends.