MFEM v4.9.0
Finite element discretization library
scan.hpp
1// Copyright (c) 2010-2025, Lawrence Livermore National Security, LLC. Produced
2// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
3// LICENSE and NOTICE for details. LLNL-CODE-806117.
4//
5// This file is part of the MFEM library. For more information and source code
6// availability visit https://mfem.org.
7//
8// MFEM is free software; you can redistribute it and/or modify it under the
9// terms of the BSD-3 license. We welcome feedback and contributions, see file
10// CONTRIBUTING.md for details.
11
12#ifndef MFEM_SCAN_HPP
13#define MFEM_SCAN_HPP
14
15#include "backends.hpp"
16#include "forall.hpp"
17
18#ifdef MFEM_USE_CUDA
19#include <cub/device/device_scan.cuh>
20#include <cub/device/device_select.cuh>
21#define MFEM_CUB_NAMESPACE cub
22#elif defined(MFEM_USE_HIP)
23#include <hipcub/device/device_scan.hpp>
24#include <hipcub/device/device_select.hpp>
25#define MFEM_CUB_NAMESPACE hipcub
26#endif
27
28#include <algorithm>
29#include <functional>
30#include <numeric>
31#include <cstddef>
32
33namespace mfem
34{
35/// Equivalent to InclusiveScan(use_dev, d_in, d_out, num_items,
36/// std::plus<>{})
37template <class InputIt, class OutputIt>
38void InclusiveScan(bool use_dev, InputIt d_in, OutputIt d_out, size_t num_items)
39{
40 // forward to InclusiveSum for potentially faster kernels
41#if defined(MFEM_USE_CUDA) || defined(MFEM_USE_HIP)
42 if (use_dev && Device::Allows(Backend::CUDA_MASK | Backend::HIP_MASK))
43 {
44 static Array<std::byte> workspace;
45 size_t bytes = workspace.Size();
46 if (bytes)
47 {
48 auto err = MFEM_CUB_NAMESPACE::DeviceScan::InclusiveSum(
49 workspace.Write(), bytes, d_in, d_out, num_items);
50#if defined(MFEM_USE_CUDA)
51 if (err == cudaSuccess)
52 {
53 return;
54 }
55#elif defined(MFEM_USE_HIP)
56 if (err == hipSuccess)
57 {
58 return;
59 }
60#endif
61 }
62 // try allocating a larger buffer
63 bytes = 0;
64 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceScan::InclusiveSum(
65 nullptr, bytes, d_in, d_out, num_items));
66 workspace.SetSize(bytes);
67 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceScan::InclusiveSum(
68 workspace.Write(), bytes, d_in, d_out, num_items));
69 return;
70 }
71#endif
72#if 0
73 std::inclusive_scan(d_in, d_in + num_items, d_out);
74#else
75 // work-around for compilers that do not fully support C++17
76 if (num_items)
77 {
78 *d_out = *d_in;
79 auto prev = d_out;
80 ++d_in;
81 ++d_out;
82 for (size_t i = 1; i < num_items; ++i)
83 {
84 *d_out = (*prev) + (*d_in);
85 prev = d_out;
86 ++d_in;
87 ++d_out;
88 }
89 }
90#endif
91}
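
// Usage sketch (editorial illustration, not part of scan.hpp): an in-place
// inclusive prefix sum over an Array<int>. The helper name and data are made
// up; the use_dev flag mirrors the backend check used by the device path
// above, so the same call works with or without a GPU backend.
inline void ExampleInclusiveSum()
{
   Array<int> a(4);
   a[0] = 3; a[1] = 1; a[2] = 4; a[3] = 1;
   const bool use_dev =
      Device::Allows(Backend::CUDA_MASK | Backend::HIP_MASK);
   // Passing d_out == d_in performs the scan in place.
   InclusiveScan(use_dev, a.ReadWrite(use_dev), a.ReadWrite(use_dev), a.Size());
   // a now holds {3, 4, 8, 9}.
}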
92
93/// @brief Performs an inclusive scan of [d_in, d_in+num_items) -> [d_out,
94/// d_out+num_items). This call is potentially asynchronous on the device.
95///
96/// @param d_in input start.
97/// @param d_out output start. In-place scans (d_out == d_in) are supported.
98/// @param scan_op binary scan functor. Must be associative. If it is only
99/// weakly associative (e.g., floating-point addition), the results are not
100/// deterministic. On device it must also be commutative.
101template <class InputIt, class OutputIt, class ScanOp>
102void InclusiveScan(bool use_dev, InputIt d_in, OutputIt d_out, size_t num_items,
103 ScanOp scan_op)
104{
105#if defined(MFEM_USE_CUDA) || defined(MFEM_USE_HIP)
106 if (use_dev && Device::Allows(Backend::CUDA_MASK | Backend::HIP_MASK))
107 {
108 static Array<std::byte> workspace;
109 size_t bytes = workspace.Size();
110 if (bytes)
111 {
112 auto err = MFEM_CUB_NAMESPACE::DeviceScan::InclusiveScan(
113 workspace.Write(), bytes, d_in, d_out, scan_op, num_items);
114#if defined(MFEM_USE_CUDA)
115 if (err == cudaSuccess)
116 {
117 return;
118 }
119#elif defined(MFEM_USE_HIP)
120 if (err == hipSuccess)
121 {
122 return;
123 }
124#endif
125 }
126 // try allocating a larger buffer
127 bytes = 0;
128 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceScan::InclusiveScan(
129 nullptr, bytes, d_in, d_out, scan_op, num_items));
130 workspace.SetSize(bytes);
131 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceScan::InclusiveScan(
132 workspace.Write(), bytes, d_in, d_out, scan_op, num_items));
133 return;
134 }
135#endif
136#if 0
137 std::inclusive_scan(d_in, d_in + num_items, d_out, scan_op);
138#else
139 // work-around for compilers that do not fully support C++17
140 if (num_items)
141 {
142 *d_out = *d_in;
143 auto prev = d_out;
144 ++d_in;
145 ++d_out;
146 for (size_t i = 1; i < num_items; ++i)
147 {
148 *d_out = scan_op(*prev, *d_in);
149 prev = d_out;
150 ++d_in;
151 ++d_out;
152 }
153 }
154#endif
155}
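
// Usage sketch (editorial illustration, not part of scan.hpp): a running
// maximum via a custom scan_op. The functor is associative and commutative,
// and it is marked MFEM_HOST_DEVICE so the same lambda is usable by both the
// device backend and the host fallback. Names and data are made up.
inline void ExampleInclusiveMax()
{
   Array<int> a(4), b(4);
   a[0] = 2; a[1] = 7; a[2] = 1; a[3] = 8;
   const bool use_dev =
      Device::Allows(Backend::CUDA_MASK | Backend::HIP_MASK);
   InclusiveScan(use_dev, a.Read(use_dev), b.Write(use_dev), a.Size(),
                 [] MFEM_HOST_DEVICE (int x, int y) { return x > y ? x : y; });
   // b now holds {2, 7, 7, 8}.
}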
156
157/// @brief Performs an exclusive scan of [d_in, d_in+num_items) -> [d_out,
158/// d_out+num_items). This call is potentially asynchronous on the device.
159/// @param d_in input start.
160/// @param d_out output start. In-place scans (d_out == d_in) are supported.
161/// @param scan_op binary scan functor. Must be associative. If it is only
162/// weakly associative (e.g., floating-point addition), the results are not
163/// deterministic. On device it must also be commutative.
164template <class InputIt, class OutputIt, class T, class ScanOp>
165void ExclusiveScan(bool use_dev, InputIt d_in, OutputIt d_out, size_t num_items,
166 T init_value, ScanOp scan_op)
167{
168#if defined(MFEM_USE_CUDA) || defined(MFEM_USE_HIP)
169 if (use_dev && Device::Allows(Backend::CUDA_MASK | Backend::HIP_MASK))
170 {
171 static Array<std::byte> workspace;
172 size_t bytes = workspace.Size();
173 if (bytes)
174 {
175 auto err = MFEM_CUB_NAMESPACE::DeviceScan::ExclusiveScan(
176 workspace.Write(), bytes, d_in, d_out, scan_op, init_value,
177 num_items);
178#if defined(MFEM_USE_CUDA)
179 if (err == cudaSuccess)
180 {
181 return;
182 }
183#elif defined(MFEM_USE_HIP)
184 if (err == hipSuccess)
185 {
186 return;
187 }
188#endif
189 }
190 // try allocating a larger buffer
191 bytes = 0;
192 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceScan::ExclusiveScan(
193 nullptr, bytes, d_in, d_out, scan_op, init_value, num_items));
194 workspace.SetSize(bytes);
195 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceScan::ExclusiveScan(
196 workspace.Write(), bytes, d_in, d_out, scan_op, init_value,
197 num_items));
198 return;
199 }
200#endif
201#if 0
202 std::exclusive_scan(d_in, d_in + num_items, d_out, init_value, scan_op);
203#else
204 // work-around for compilers that do not fully support C++17
205 if (num_items)
206 {
207 for (size_t i = 0; i < num_items; ++i)
208 {
209 auto next = scan_op(init_value, *d_in);
210 *d_out = init_value;
211 init_value = next;
212 ++d_out;
213 ++d_in;
214 }
215 }
216#endif
217}
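
// Usage sketch (editorial illustration, not part of scan.hpp): an exclusive
// scan with a custom operation, here a running product seeded with
// init_value = 1. The first output equals init_value and the last input does
// not contribute to any output. Names and data are made up.
inline void ExampleExclusiveProduct()
{
   Array<int> a(4);
   a[0] = 2; a[1] = 3; a[2] = 4; a[3] = 5;
   const bool use_dev =
      Device::Allows(Backend::CUDA_MASK | Backend::HIP_MASK);
   ExclusiveScan(use_dev, a.ReadWrite(use_dev), a.ReadWrite(use_dev), a.Size(),
                 1, [] MFEM_HOST_DEVICE (int x, int y) { return x * y; });
   // a now holds {1, 2, 6, 24}.
}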
218
219/// Equivalent to ExclusiveScan(use_dev, d_in, d_out, num_items, init_value,
220/// std::plus<>{})
221template <class InputIt, class OutputIt, class T>
222void ExclusiveScan(bool use_dev, InputIt d_in, OutputIt d_out, size_t num_items,
223 T init_value)
224{
225 ExclusiveScan(use_dev, d_in, d_out, num_items, init_value, std::plus<> {});
226}
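
// Usage sketch (editorial illustration, not part of scan.hpp): the common
// "counts to offsets" conversion, e.g. when assembling CSR-like index
// structures. With init_value = 0, offsets[i] becomes the sum of
// counts[0..i-1]. Names and data are made up.
inline void ExampleCountsToOffsets()
{
   Array<int> counts(4), offsets(4);
   counts[0] = 2; counts[1] = 0; counts[2] = 3; counts[3] = 1;
   const bool use_dev =
      Device::Allows(Backend::CUDA_MASK | Backend::HIP_MASK);
   ExclusiveScan(use_dev, counts.Read(use_dev), offsets.Write(use_dev),
                 counts.Size(), 0);
   // offsets now holds {0, 2, 2, 5}.
}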
227
228/// @brief Copies the values in [d_in, d_in+num_items) whose corresponding
229/// flag in [d_flags, d_flags+num_items) is true to d_out, and stores the
230/// number of copied values in *d_num_selected_out.
231///
232/// None of the following ranges may overlap:
233/// - [d_in, d_in+num_items)
234/// - [d_flags, d_flags+num_items)
235/// - [d_out, d_out+*d_num_selected_out)
236/// - [d_num_selected_out, d_num_selected_out+1)
237template <class InputIt, class FlagIt, class OutputIt, class NumSelectedIt>
238void CopyFlagged(bool use_dev, InputIt d_in, FlagIt d_flags, OutputIt d_out,
239 NumSelectedIt d_num_selected_out, size_t num_items)
240{
241#if defined(MFEM_USE_CUDA) || defined(MFEM_USE_HIP)
242 if (use_dev &&
243 Device::Allows(Backend::CUDA_MASK | Backend::HIP_MASK))
244 {
245 static Array<std::byte> workspace;
246 size_t bytes = workspace.Size();
247 if (bytes)
248 {
249 auto err = MFEM_CUB_NAMESPACE::DeviceSelect::Flagged(
250 workspace.Write(), bytes, d_in, d_flags, d_out, d_num_selected_out,
251 num_items);
252#if defined(MFEM_USE_CUDA)
253 if (err == cudaSuccess)
254 {
255 return;
256 }
257#elif defined(MFEM_USE_HIP)
258 if (err == hipSuccess)
259 {
260 return;
261 }
262#endif
263 }
264 // try allocating a larger buffer
265 bytes = 0;
266 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceSelect::Flagged(
267 nullptr, bytes, d_in, d_flags, d_out, d_num_selected_out, num_items));
268 workspace.SetSize(bytes);
269 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceSelect::Flagged(
270 workspace.Write(), bytes, d_in, d_flags, d_out, d_num_selected_out,
271 num_items));
272 return;
273 }
274#endif
275 *d_num_selected_out = 0;
276 for (size_t i = 0; i < num_items; ++i, ++d_in, ++d_flags)
277 {
278 if (*d_flags)
279 {
280 *d_out = *d_in;
281 ++d_out;
282 ++*d_num_selected_out;
283 }
284 }
285}
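
// Usage sketch (editorial illustration, not part of scan.hpp): compact the
// entries selected by a separate flag array. The selected count is written
// through d_num_selected_out, here a one-entry Array<int> that is read back
// on the host afterwards. Names and data are made up.
inline void ExampleCopyFlagged()
{
   Array<int> in(4), out(4), num_selected(1);
   in[0] = 10; in[1] = 20; in[2] = 30; in[3] = 40;
   Array<bool> flags(4);
   flags[0] = true; flags[1] = false; flags[2] = true; flags[3] = false;
   const bool use_dev =
      Device::Allows(Backend::CUDA_MASK | Backend::HIP_MASK);
   CopyFlagged(use_dev, in.Read(use_dev), flags.Read(use_dev),
               out.Write(use_dev), num_selected.Write(use_dev), in.Size());
   // num_selected.HostRead()[0] == 2 and out begins with {10, 30}.
}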
286
287/// @brief Equivalent to *d_num_selected_out = std::copy_if(d_in,
288/// d_in+num_items, d_out, select_op) - d_out;
289///
290/// None of the following ranges may overlap:
291/// - [d_in, d_in+num_items)
292/// - [d_out, d_out+*d_num_selected_out)
293/// - [d_num_selected_out, d_num_selected_out+1)
294template <class InputIt, class OutputIt, class NumSelectedIt, class SelectOp>
295void CopyIf(bool use_dev, InputIt d_in, OutputIt d_out,
296 NumSelectedIt d_num_selected_out, size_t num_items,
297 SelectOp select_op)
298{
299#if defined(MFEM_USE_CUDA) || defined(MFEM_USE_HIP)
300 if (use_dev &&
301 Device::Allows(Backend::CUDA_MASK | Backend::HIP_MASK))
302 {
303#if defined(MFEM_USE_CUDA) && \
304 (__CUDACC_VER_MAJOR__ < 12 || \
305 (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 5))
306 // work-around for a bug in CUDA < 12.5: use Flagged instead
307 Array<bool> flags(num_items);
308 auto ptr = flags.Write();
309 forall(num_items,
310 [=] MFEM_HOST_DEVICE(int i) { ptr[i] = select_op(d_in[i]); });
311 CopyFlagged(use_dev, d_in, ptr, d_out, d_num_selected_out, num_items);
312#else
313 static Array<std::byte> workspace;
314 size_t bytes = workspace.Size();
315 if (bytes)
316 {
317 auto err = MFEM_CUB_NAMESPACE::DeviceSelect::If(
318 workspace.Write(), bytes, d_in, d_out, d_num_selected_out,
319 num_items, select_op);
320#if defined(MFEM_USE_CUDA)
321 if (err == cudaSuccess)
322 {
323 return;
324 }
325#elif defined(MFEM_USE_HIP)
326 if (err == hipSuccess)
327 {
328 return;
329 }
330#endif
331 }
332 // try allocating a larger buffer
333 bytes = 0;
334 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceSelect::If(
335 nullptr, bytes, d_in, d_out, d_num_selected_out, num_items,
336 select_op));
337 workspace.SetSize(bytes);
338 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceSelect::If(
339 workspace.Write(), bytes, d_in, d_out, d_num_selected_out, num_items,
340 select_op));
341#endif
342 return;
343 }
344#endif
345 *d_num_selected_out = 0;
346 for (size_t i = 0; i < num_items; ++i, ++d_in)
347 {
348 if (select_op(*d_in))
349 {
350 *d_out = *d_in;
351 ++d_out;
352 ++*d_num_selected_out;
353 }
354 }
355}
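
// Usage sketch (editorial illustration, not part of scan.hpp): keep only the
// entries satisfying a predicate. As with the scan functors, select_op is
// marked MFEM_HOST_DEVICE so the same lambda works on both code paths.
// Names and data are made up.
inline void ExampleCopyIf()
{
   Array<int> in(5), out(5), num_selected(1);
   in[0] = -1; in[1] = 4; in[2] = 0; in[3] = 7; in[4] = -3;
   const bool use_dev =
      Device::Allows(Backend::CUDA_MASK | Backend::HIP_MASK);
   CopyIf(use_dev, in.Read(use_dev), out.Write(use_dev),
          num_selected.Write(use_dev), in.Size(),
          [] MFEM_HOST_DEVICE (int v) { return v > 0; });
   // num_selected.HostRead()[0] == 2 and out begins with {4, 7}.
}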
356
357/// @brief Equivalent to *d_num_selected_out = std::unique_copy(d_in,
358/// d_in+num_items, d_out) - d_out;
359///
360/// None of the following ranges may overlap:
361/// - [d_in, d_in+num_items)
362/// - [d_out, d_out+*d_num_selected_out)
363/// - [d_num_selected_out, d_num_selected_out+1)
364template <class InputIt, class OutputIt, class NumSelectedIt>
365void CopyUnique(bool use_dev, InputIt d_in, OutputIt d_out,
366 NumSelectedIt d_num_selected_out, size_t num_items)
367{
368#if defined(MFEM_USE_CUDA) || defined(MFEM_USE_HIP)
369 if (use_dev &&
370 Device::Allows(Backend::CUDA_MASK | Backend::HIP_MASK))
371 {
372 static Array<std::byte> workspace;
373 size_t bytes = workspace.Size();
374 if (bytes)
375 {
376 auto err = MFEM_CUB_NAMESPACE::DeviceSelect::Unique(
377 workspace.Write(), bytes, d_in, d_out, d_num_selected_out,
378 num_items);
379#if defined(MFEM_USE_CUDA)
380 if (err == cudaSuccess)
381 {
382 return;
383 }
384#elif defined(MFEM_USE_HIP)
385 if (err == hipSuccess)
386 {
387 return;
388 }
389#endif
390 }
391 // try allocating a larger buffer
392 bytes = 0;
393 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceSelect::Unique(
394 nullptr, bytes, d_in, d_out, d_num_selected_out, num_items));
395 workspace.SetSize(bytes);
396 MFEM_GPU_CHECK(MFEM_CUB_NAMESPACE::DeviceSelect::Unique(
397 workspace.Write(), bytes, d_in, d_out, d_num_selected_out,
398 num_items));
399 return;
400 }
401#endif
402 *d_num_selected_out =
403 std::unique_copy(d_in, d_in + num_items, d_out) - d_out;
404}
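
// Usage sketch (editorial illustration, not part of scan.hpp): collapse
// consecutive duplicates, e.g. in a sorted list of indices. Like
// std::unique_copy, only adjacent equal values are merged, so unsorted input
// should be sorted first. Names and data are made up.
inline void ExampleCopyUnique()
{
   Array<int> in(6), out(6), num_selected(1);
   in[0] = 1; in[1] = 1; in[2] = 2; in[3] = 2; in[4] = 2; in[5] = 5;
   const bool use_dev =
      Device::Allows(Backend::CUDA_MASK | Backend::HIP_MASK);
   CopyUnique(use_dev, in.Read(use_dev), out.Write(use_dev),
              num_selected.Write(use_dev), in.Size());
   // num_selected.HostRead()[0] == 3 and out begins with {1, 2, 5}.
}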
405} // namespace mfem
406
407#undef MFEM_CUB_NAMESPACE
408
409#endif
void SetSize(int nsize)
Change the logical size of the array, keep existing entries.
Definition array.hpp:840
int Size() const
Return the logical size of the array.
Definition array.hpp:166
T * Write(bool on_dev=true)
Shortcut for mfem::Write(a.GetMemory(), a.Size(), on_dev).
Definition array.hpp:389
static bool Allows(unsigned long b_mask)
Return true if any of the backends in the backend mask, b_mask, are allowed.
Definition device.hpp:262
void ExclusiveScan(bool use_dev, InputIt d_in, OutputIt d_out, size_t num_items, T init_value, ScanOp scan_op)
Definition scan.hpp:165
void CopyFlagged(bool use_dev, InputIt d_in, FlagIt d_flags, OutputIt d_out, NumSelectedIt d_num_selected_out, size_t num_items)
Copies the values in [d_in, d_in+num_items) whose corresponding flag in [d_flags, d_flags+num_items) is true to d_out, and stores the number of copied values in *d_num_selected_out.
Definition scan.hpp:238
void CopyUnique(bool use_dev, InputIt d_in, OutputIt d_out, NumSelectedIt d_num_selected_out, size_t num_items)
Equivalent to *d_num_selected_out = std::unique_copy(d_in, d_in+num_items, d_out) - d_out;
Definition scan.hpp:365
void InclusiveScan(bool use_dev, InputIt d_in, OutputIt d_out, size_t num_items)
Definition scan.hpp:38
void CopyIf(bool use_dev, InputIt d_in, OutputIt d_out, NumSelectedIt d_num_selected_out, size_t num_items, SelectOp select_op)
Equivalent to *d_num_selected_out = std::copy_if(d_in, d_in+num_items, d_out, select_op) - d_out;.
Definition scan.hpp:295
void forall(int N, lambda &&body)
Definition forall.hpp:839
@ HIP_MASK
Bitwise-OR of all HIP backends.
Definition device.hpp:93
@ CUDA_MASK
Bitwise-OR of all CUDA backends.
Definition device.hpp:91