MFEM v4.8.0
Finite element discretization library
forall.hpp
1// Copyright (c) 2010-2025, Lawrence Livermore National Security, LLC. Produced
2// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
3// LICENSE and NOTICE for details. LLNL-CODE-806117.
4//
5// This file is part of the MFEM library. For more information and source code
6// availability visit https://mfem.org.
7//
8// MFEM is free software; you can redistribute it and/or modify it under the
9// terms of the BSD-3 license. We welcome feedback and contributions, see file
10// CONTRIBUTING.md for details.
11
12#ifndef MFEM_FORALL_HPP
13#define MFEM_FORALL_HPP
14
15#include "../config/config.hpp"
16#include "annotation.hpp"
17#include "error.hpp"
18#include "backends.hpp"
19#include "device.hpp"
20#include "mem_manager.hpp"
21#include "../linalg/dtensor.hpp"
22#ifdef MFEM_USE_MPI
23#include <_hypre_utilities.h>
24#endif
25
26#include "array.hpp"
27#include "reducers.hpp"
28
29namespace mfem
30{
31
32// The following DofQuadLimits_ structs define the maximum values of D1D and Q1D
33// often used in the "fallback kernels" for partial assembly. Different limits
34// take effect for different architectures. The limits should be queried using
35// the public interface in DeviceDofQuadLimits or DofQuadLimits, and generally
36// not by directly accessing the structs defined below.
37//
38// In host code, the limits associated with the currently configured Device can
39// be accessed using DeviceDofQuadLimits::Get().
40//
41// In mfem::forall kernels or MFEM_HOST_DEVICE functions, the limits
42// corresponding to the architecture the function is being compiled for can be
43// accessed as static constexpr variables using the type alias DofQuadLimits.
44
45namespace internal
46{
47
48struct DofQuadLimits_CUDA
49{
50 static constexpr int MAX_D1D = 14;
51 static constexpr int MAX_Q1D = 14;
52 static constexpr int HCURL_MAX_D1D = 5;
53 static constexpr int HCURL_MAX_Q1D = 6;
54 static constexpr int HDIV_MAX_D1D = 5;
55 static constexpr int HDIV_MAX_Q1D = 6;
56 static constexpr int MAX_INTERP_1D = 8;
57 static constexpr int MAX_DET_1D = 6;
58};
59
60struct DofQuadLimits_HIP
61{
62 static constexpr int MAX_D1D = 10;
63 static constexpr int MAX_Q1D = 10;
64 static constexpr int HCURL_MAX_D1D = 5;
65 static constexpr int HCURL_MAX_Q1D = 5;
66 static constexpr int HDIV_MAX_D1D = 5;
67 static constexpr int HDIV_MAX_Q1D = 6;
68 static constexpr int MAX_INTERP_1D = 8;
69 static constexpr int MAX_DET_1D = 6;
70};
71
72struct DofQuadLimits_CPU
73{
74#ifndef _WIN32
75 static constexpr int MAX_D1D = 24;
76 static constexpr int MAX_Q1D = 24;
77#else
78 static constexpr int MAX_D1D = 14;
79 static constexpr int MAX_Q1D = 14;
80#endif
81 static constexpr int HCURL_MAX_D1D = 10;
82 static constexpr int HCURL_MAX_Q1D = 10;
83 static constexpr int HDIV_MAX_D1D = 10;
84 static constexpr int HDIV_MAX_Q1D = 10;
85 static constexpr int MAX_INTERP_1D = MAX_D1D;
86 static constexpr int MAX_DET_1D = MAX_D1D;
87};
88
89} // namespace internal
90
91/// @brief Maximum number of 1D DOFs or quadrature points for the architecture
92/// currently being compiled for (used in fallback kernels).
93///
94/// DofQuadLimits provides access to the limits as static constexpr member
95/// variables for use in mfem::forall kernels or MFEM_HOST_DEVICE functions.
96///
97/// @sa For accessing the limits according to the runtime configuration of the
98/// Device, see DeviceDofQuadLimits.
99#if defined(__CUDA_ARCH__)
100using DofQuadLimits = internal::DofQuadLimits_CUDA;
101#elif defined(__HIP_DEVICE_COMPILE__)
102using DofQuadLimits = internal::DofQuadLimits_HIP;
103#else
104using DofQuadLimits = internal::DofQuadLimits_CPU;
105#endif
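// Illustrative sketch (not part of this header): a fallback kernel body can
// size its scratch arrays with the compile-time limit for the target currently
// being compiled. The helper name below is hypothetical.
// MFEM_HOST_DEVICE inline void ExampleZeroQuadValues(int q1d, double *out)
// {
//    double buffer[DofQuadLimits::MAX_Q1D]; // fits any supported Q1D
//    for (int q = 0; q < q1d; ++q) { buffer[q] = 0.0; }
//    for (int q = 0; q < q1d; ++q) { out[q] = buffer[q]; }
// }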
106
107/// @brief Maximum number of 1D DOFs or quadrature points for the current
108/// runtime configuration of the Device (used in fallback kernels).
109///
110/// DeviceDofQuadLimits can be used in host code to query the limits for the
111/// configured device (e.g. when the user has selected GPU execution at
112/// runtime).
113///
114/// @sa For accessing the limits according to the current compiler pass, see
115/// DofQuadLimits.
116struct DeviceDofQuadLimits
117{
118 int MAX_D1D; ///< Maximum number of 1D nodal points.
119 int MAX_Q1D; ///< Maximum number of 1D quadrature points.
120 int HCURL_MAX_D1D; ///< Maximum number of 1D nodal points for H(curl).
121 int HCURL_MAX_Q1D; ///< Maximum number of 1D quadrature points for H(curl).
122 int HDIV_MAX_D1D; ///< Maximum number of 1D nodal points for H(div).
123 int HDIV_MAX_Q1D; ///< Maximum number of 1D quadrature points for H(div).
124 int MAX_INTERP_1D; ///< Maximum number of points for use in QuadratureInterpolator.
125 int MAX_DET_1D; ///< Maximum number of points for determinant computation in QuadratureInterpolator.
126
127 /// Return a const reference to the DeviceDofQuadLimits singleton.
128 static const DeviceDofQuadLimits &Get()
129 {
130 static const DeviceDofQuadLimits dof_quad_limits;
131 return dof_quad_limits;
132 }
133
134private:
135 /// Initialize the limits depending on the configuration of the Device.
136   DeviceDofQuadLimits()
137   {
138 if (Device::Allows(Backend::CUDA_MASK)) { Populate<internal::DofQuadLimits_CUDA>(); }
139 else if (Device::Allows(Backend::HIP_MASK)) { Populate<internal::DofQuadLimits_HIP>(); }
140 else { Populate<internal::DofQuadLimits_CPU>(); }
141 }
142
143 /// @brief Set the limits using the static members of the type @a T.
144 ///
145 /// @a T should be one of DofQuadLimits_CUDA, DofQuadLimits_HIP, or
146 /// DofQuadLimits_CPU.
147 template <typename T> void Populate()
148 {
149 MAX_D1D = T::MAX_D1D;
150 MAX_Q1D = T::MAX_Q1D;
151 HCURL_MAX_D1D = T::HCURL_MAX_D1D;
152 HCURL_MAX_Q1D = T::HCURL_MAX_Q1D;
153 HDIV_MAX_D1D = T::HDIV_MAX_D1D;
154 HDIV_MAX_Q1D = T::HDIV_MAX_Q1D;
155 MAX_INTERP_1D = T::MAX_INTERP_1D;
156 MAX_DET_1D = T::MAX_DET_1D;
157 }
158};
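// Illustrative sketch (not part of this header): host code can query the
// limits matching the runtime Device configuration through the singleton.
// The helper name below is hypothetical.
// inline int ExampleMaxAllowedOrder()
// {
//    const DeviceDofQuadLimits &limits = DeviceDofQuadLimits::Get();
//    return limits.MAX_D1D - 1; // polynomial order implied by the 1D DOF limit
// }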
159
160// MFEM pragma macros that can be used inside MFEM_FORALL macros.
161#define MFEM_PRAGMA(X) _Pragma(#X)
162
163// MFEM_UNROLL pragma macro that can be used inside MFEM_FORALL macros.
164#if defined(MFEM_USE_CUDA) && defined(__CUDA_ARCH__)
165#define MFEM_UNROLL(N) MFEM_PRAGMA(unroll(N))
166#else
167#define MFEM_UNROLL(N)
168#endif
169
170// MFEM_GPU_FORALL: "parallel for" executed with CUDA or HIP based on the MFEM
171// build-time configuration (MFEM_USE_CUDA or MFEM_USE_HIP). If neither CUDA nor
172// HIP is enabled, this macro is a no-op.
173#if defined(MFEM_USE_CUDA)
174#define MFEM_GPU_FORALL(i, N,...) CuWrap1D(N, [=] MFEM_DEVICE \
175 (int i) {__VA_ARGS__})
176#elif defined(MFEM_USE_HIP)
177#define MFEM_GPU_FORALL(i, N,...) HipWrap1D(N, [=] MFEM_DEVICE \
178 (int i) {__VA_ARGS__})
179#else
180#define MFEM_GPU_FORALL(i, N,...) do { } while (false)
181#endif
182
183// Implementation of MFEM's "parallel for" (forall) device/host kernel
184// interfaces supporting RAJA, CUDA, OpenMP, and sequential backends.
185
186// The MFEM_FORALL wrapper
187#define MFEM_FORALL(i,N,...) \
188 ForallWrap<1>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__})
189
190// MFEM_FORALL with a 2D CUDA block
191#define MFEM_FORALL_2D(i,N,X,Y,BZ,...) \
192 ForallWrap<2>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,BZ)
193
194// MFEM_FORALL with a 3D CUDA block
195#define MFEM_FORALL_3D(i,N,X,Y,Z,...) \
196 ForallWrap<3>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,Z)
197
198// MFEM_FORALL with a 3D CUDA block and grid
199// With G=0, this is the same as MFEM_FORALL_3D(i,N,X,Y,Z,...)
200#define MFEM_FORALL_3D_GRID(i,N,X,Y,Z,G,...) \
201 ForallWrap<3>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,Z,G)
202
203// MFEM_FORALL that uses the basic CPU backend when use_dev is false. See for
204// example the functions in vector.cpp, where we don't want to use the mfem
205// device for operations on small vectors.
206#define MFEM_FORALL_SWITCH(use_dev,i,N,...) \
207 ForallWrap<1>(use_dev,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__})
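// Illustrative sketch (not part of this header): the 1D macro is used like a
// regular for loop whose body becomes the lambda body. The function name is
// hypothetical and the pointers are assumed to be device-accessible, e.g.
// obtained from Vector::Read()/ReadWrite().
// inline void ExampleAxpy(int n, double a, const double *x, double *y)
// {
//    MFEM_FORALL(i, n, y[i] += a * x[i];);
// }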
208
209
210/// OpenMP backend
211template <typename HBODY>
212void OmpWrap(const int N, HBODY &&h_body)
213{
214#ifdef MFEM_USE_OPENMP
215 #pragma omp parallel for
216 for (int k = 0; k < N; k++)
217 {
218 h_body(k);
219 }
220#else
221 MFEM_CONTRACT_VAR(N);
222 MFEM_CONTRACT_VAR(h_body);
223 MFEM_ABORT("OpenMP requested for MFEM but OpenMP is not enabled!");
224#endif
225}
226
227
228/// RAJA Cuda and Hip backends
229#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
230using cuda_launch_policy =
231   RAJA::LaunchPolicy<RAJA::cuda_launch_t<true>>;
232using cuda_teams_x =
233   RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
234using cuda_threads_z =
235   RAJA::LoopPolicy<RAJA::cuda_thread_z_direct>;
236#endif
237
238#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
239using hip_launch_policy =
240   RAJA::LaunchPolicy<RAJA::hip_launch_t<true>>;
241using hip_teams_x =
242   RAJA::LoopPolicy<RAJA::hip_block_x_direct>;
243using hip_threads_z =
244   RAJA::LoopPolicy<RAJA::hip_thread_z_direct>;
245#endif
246
247#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
248template <const int BLOCKS = MFEM_CUDA_BLOCKS, typename DBODY>
249void RajaCuWrap1D(const int N, DBODY &&d_body)
250{
251 //true denotes asynchronous kernel
252 RAJA::forall<RAJA::cuda_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
253}
254
255template <typename DBODY>
256void RajaCuWrap2D(const int N, DBODY &&d_body,
257 const int X, const int Y, const int BZ)
258{
259 MFEM_VERIFY(BZ>0, "");
260 const int G = (N+BZ-1)/BZ;
261
262 using namespace RAJA;
263 using RAJA::RangeSegment;
264
265 launch<cuda_launch_policy>
266 (LaunchParams(Teams(G), Threads(X, Y, BZ)),
267 [=] RAJA_DEVICE (LaunchContext ctx)
268 {
269
270 loop<cuda_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
271 {
272
273 loop<cuda_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
274 {
275
276 const int k = n*BZ + tz;
277 if (k >= N) { return; }
278 d_body(k);
279
280 });
281
282 });
283
284 });
285
286 MFEM_GPU_CHECK(cudaGetLastError());
287}
288
289template <typename DBODY>
290void RajaCuWrap3D(const int N, DBODY &&d_body,
291 const int X, const int Y, const int Z, const int G)
292{
293 const int GRID = G == 0 ? N : G;
294 using namespace RAJA;
295 using RAJA::RangeSegment;
296
297 launch<cuda_launch_policy>
298 (LaunchParams(Teams(GRID), Threads(X, Y, Z)),
299 [=] RAJA_DEVICE (LaunchContext ctx)
300 {
301
302 loop<cuda_teams_x>(ctx, RangeSegment(0, N), d_body);
303
304 });
305
306 MFEM_GPU_CHECK(cudaGetLastError());
307}
308
309template <int Dim>
310struct RajaCuWrap;
311
312template <>
313struct RajaCuWrap<1>
314{
315 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
316 static void run(const int N, DBODY &&d_body,
317 const int X, const int Y, const int Z, const int G)
318 {
319 RajaCuWrap1D<BLCK>(N, d_body);
320 }
321};
322
323template <>
324struct RajaCuWrap<2>
325{
326 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
327 static void run(const int N, DBODY &&d_body,
328 const int X, const int Y, const int Z, const int G)
329 {
330 RajaCuWrap2D(N, d_body, X, Y, Z);
331 }
332};
333
334template <>
335struct RajaCuWrap<3>
336{
337 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
338 static void run(const int N, DBODY &&d_body,
339 const int X, const int Y, const int Z, const int G)
340 {
341 RajaCuWrap3D(N, d_body, X, Y, Z, G);
342 }
343};
344
345#endif
346
347#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
348template <const int BLOCKS = MFEM_HIP_BLOCKS, typename DBODY>
349void RajaHipWrap1D(const int N, DBODY &&d_body)
350{
351 //true denotes asynchronous kernel
352 RAJA::forall<RAJA::hip_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
353}
354
355template <typename DBODY>
356void RajaHipWrap2D(const int N, DBODY &&d_body,
357 const int X, const int Y, const int BZ)
358{
359 MFEM_VERIFY(BZ>0, "");
360 const int G = (N+BZ-1)/BZ;
361
362 using namespace RAJA;
363 using RAJA::RangeSegment;
364
365 launch<hip_launch_policy>
366 (LaunchParams(Teams(G), Threads(X, Y, BZ)),
367 [=] RAJA_DEVICE (LaunchContext ctx)
368 {
369
370 loop<hip_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
371 {
372
373 loop<hip_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
374 {
375
376 const int k = n*BZ + tz;
377 if (k >= N) { return; }
378 d_body(k);
379
380 });
381
382 });
383
384 });
385
386 MFEM_GPU_CHECK(hipGetLastError());
387}
388
389template <typename DBODY>
390void RajaHipWrap3D(const int N, DBODY &&d_body,
391 const int X, const int Y, const int Z, const int G)
392{
393 const int GRID = G == 0 ? N : G;
394 using namespace RAJA;
395 using RAJA::RangeSegment;
396
397 launch<hip_launch_policy>
398 (LaunchParams(Teams(GRID), Threads(X, Y, Z)),
399 [=] RAJA_DEVICE (LaunchContext ctx)
400 {
401
402 loop<hip_teams_x>(ctx, RangeSegment(0, N), d_body);
403
404 });
405
406 MFEM_GPU_CHECK(hipGetLastError());
407}
408
409template <int Dim>
410struct RajaHipWrap;
411
412template <>
413struct RajaHipWrap<1>
414{
415 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
416 static void run(const int N, DBODY &&d_body,
417 const int X, const int Y, const int Z, const int G)
418 {
419 RajaHipWrap1D<BLCK>(N, d_body);
420 }
421};
422
423template <>
424struct RajaHipWrap<2>
425{
426 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
427 static void run(const int N, DBODY &&d_body,
428 const int X, const int Y, const int Z, const int G)
429 {
430 RajaHipWrap2D(N, d_body, X, Y, Z);
431 }
432};
433
434template <>
435struct RajaHipWrap<3>
436{
437 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
438 static void run(const int N, DBODY &&d_body,
439 const int X, const int Y, const int Z, const int G)
440 {
441 RajaHipWrap3D(N, d_body, X, Y, Z, G);
442 }
443};
444
445#endif
446
447/// RAJA OpenMP backend
448#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
449
450template <typename HBODY>
451void RajaOmpWrap(const int N, HBODY &&h_body)
452{
453 RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0,N), h_body);
454}
455
456#endif
457
458
459/// RAJA sequential loop backend
460template <typename HBODY>
461void RajaSeqWrap(const int N, HBODY &&h_body)
462{
463#ifdef MFEM_USE_RAJA
464
465#if (RAJA_VERSION_MAJOR >= 2023)
466 //loop_exec was marked deprecated in RAJA version 2023.06.0
467 //and will be removed. We now use seq_exec.
468 using raja_forall_pol = RAJA::seq_exec;
469#else
470 using raja_forall_pol = RAJA::loop_exec;
471#endif
472
473 RAJA::forall<raja_forall_pol>(RAJA::RangeSegment(0,N), h_body);
474#else
475 MFEM_CONTRACT_VAR(N);
476 MFEM_CONTRACT_VAR(h_body);
477 MFEM_ABORT("RAJA requested but RAJA is not enabled!");
478#endif
479}
480
481
482/// CUDA backend
483#ifdef MFEM_USE_CUDA
484
485template <typename BODY> __global__ static
486void CuKernel1D(const int N, BODY body)
487{
488 const int k = blockDim.x*blockIdx.x + threadIdx.x;
489 if (k >= N) { return; }
490 body(k);
491}
492
493template <typename BODY> __global__ static
494void CuKernel2D(const int N, BODY body)
495{
496 const int k = blockIdx.x*blockDim.z + threadIdx.z;
497 if (k >= N) { return; }
498 body(k);
499}
500
501template <typename BODY> __global__ static
502void CuKernel3D(const int N, BODY body)
503{
504 for (int k = blockIdx.x; k < N; k += gridDim.x) { body(k); }
505}
506
507template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
508void CuWrap1D(const int N, DBODY &&d_body)
509{
510 if (N==0) { return; }
511 const int GRID = (N+BLCK-1)/BLCK;
512 CuKernel1D<<<GRID,BLCK>>>(N, d_body);
513 MFEM_GPU_CHECK(cudaGetLastError());
514}
515
516template <typename DBODY>
517void CuWrap2D(const int N, DBODY &&d_body,
518 const int X, const int Y, const int BZ)
519{
520 if (N==0) { return; }
521 MFEM_VERIFY(BZ>0, "");
522 const int GRID = (N+BZ-1)/BZ;
523 const dim3 BLCK(X,Y,BZ);
524 CuKernel2D<<<GRID,BLCK>>>(N,d_body);
525 MFEM_GPU_CHECK(cudaGetLastError());
526}
527
528template <typename DBODY>
529void CuWrap3D(const int N, DBODY &&d_body,
530 const int X, const int Y, const int Z, const int G)
531{
532 if (N==0) { return; }
533 const int GRID = G == 0 ? N : G;
534 const dim3 BLCK(X,Y,Z);
535 CuKernel3D<<<GRID,BLCK>>>(N,d_body);
536 MFEM_GPU_CHECK(cudaGetLastError());
537}
538
539template <int Dim>
540struct CuWrap;
541
542template <>
543struct CuWrap<1>
544{
545 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
546 static void run(const int N, DBODY &&d_body,
547 const int X, const int Y, const int Z, const int G)
548 {
549 CuWrap1D<BLCK>(N, d_body);
550 }
551};
552
553template <>
554struct CuWrap<2>
555{
556 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
557 static void run(const int N, DBODY &&d_body,
558 const int X, const int Y, const int Z, const int G)
559 {
560 CuWrap2D(N, d_body, X, Y, Z);
561 }
562};
563
564template <>
565struct CuWrap<3>
566{
567 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
568 static void run(const int N, DBODY &&d_body,
569 const int X, const int Y, const int Z, const int G)
570 {
571 CuWrap3D(N, d_body, X, Y, Z, G);
572 }
573};
574
575#endif // MFEM_USE_CUDA
576
577
578/// HIP backend
579#ifdef MFEM_USE_HIP
580
581template <typename BODY> __global__ static
582void HipKernel1D(const int N, BODY body)
583{
584 const int k = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
585 if (k >= N) { return; }
586 body(k);
587}
588
589template <typename BODY> __global__ static
590void HipKernel2D(const int N, BODY body)
591{
592 const int k = hipBlockIdx_x*hipBlockDim_z + hipThreadIdx_z;
593 if (k >= N) { return; }
594 body(k);
595}
596
597template <typename BODY> __global__ static
598void HipKernel3D(const int N, BODY body)
599{
600 for (int k = hipBlockIdx_x; k < N; k += hipGridDim_x) { body(k); }
601}
602
603template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
604void HipWrap1D(const int N, DBODY &&d_body)
605{
606 if (N==0) { return; }
607 const int GRID = (N+BLCK-1)/BLCK;
608 hipLaunchKernelGGL(HipKernel1D,GRID,BLCK,0,nullptr,N,d_body);
609 MFEM_GPU_CHECK(hipGetLastError());
610}
611
612template <typename DBODY>
613void HipWrap2D(const int N, DBODY &&d_body,
614 const int X, const int Y, const int BZ)
615{
616 if (N==0) { return; }
617 const int GRID = (N+BZ-1)/BZ;
618 const dim3 BLCK(X,Y,BZ);
619 hipLaunchKernelGGL(HipKernel2D,GRID,BLCK,0,nullptr,N,d_body);
620 MFEM_GPU_CHECK(hipGetLastError());
621}
622
623template <typename DBODY>
624void HipWrap3D(const int N, DBODY &&d_body,
625 const int X, const int Y, const int Z, const int G)
626{
627 if (N==0) { return; }
628 const int GRID = G == 0 ? N : G;
629 const dim3 BLCK(X,Y,Z);
630 hipLaunchKernelGGL(HipKernel3D,GRID,BLCK,0,nullptr,N,d_body);
631 MFEM_GPU_CHECK(hipGetLastError());
632}
633
634template <int Dim>
635struct HipWrap;
636
637template <>
638struct HipWrap<1>
639{
640 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
641 static void run(const int N, DBODY &&d_body,
642 const int X, const int Y, const int Z, const int G)
643 {
644 HipWrap1D<BLCK>(N, d_body);
645 }
646};
647
648template <>
649struct HipWrap<2>
650{
651 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
652 static void run(const int N, DBODY &&d_body,
653 const int X, const int Y, const int Z, const int G)
654 {
655 HipWrap2D(N, d_body, X, Y, Z);
656 }
657};
658
659template <>
660struct HipWrap<3>
661{
662 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
663 static void run(const int N, DBODY &&d_body,
664 const int X, const int Y, const int Z, const int G)
665 {
666 HipWrap3D(N, d_body, X, Y, Z, G);
667 }
668};
669
670#endif // MFEM_USE_HIP
671
672
673/// The forall kernel body wrapper
674template <const int DIM, typename d_lambda, typename h_lambda>
675inline void ForallWrap(const bool use_dev, const int N,
676 d_lambda &&d_body, h_lambda &&h_body,
677 const int X=0, const int Y=0, const int Z=0,
678 const int G=0)
679{
680 MFEM_CONTRACT_VAR(X);
681 MFEM_CONTRACT_VAR(Y);
682 MFEM_CONTRACT_VAR(Z);
683 MFEM_CONTRACT_VAR(G);
684 MFEM_CONTRACT_VAR(d_body);
685 if (!use_dev) { goto backend_cpu; }
686
687#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
688 // If Backend::RAJA_CUDA is allowed, use it
689   if (Device::Allows(Backend::RAJA_CUDA))
690   {
691 return RajaCuWrap<DIM>::run(N, d_body, X, Y, Z, G);
692 }
693#endif
694
695#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
696 // If Backend::RAJA_HIP is allowed, use it
697   if (Device::Allows(Backend::RAJA_HIP))
698   {
699 return RajaHipWrap<DIM>::run(N, d_body, X, Y, Z, G);
700 }
701#endif
702
703#ifdef MFEM_USE_CUDA
704 // If Backend::CUDA is allowed, use it
705   if (Device::Allows(Backend::CUDA))
706   {
707 return CuWrap<DIM>::run(N, d_body, X, Y, Z, G);
708 }
709#endif
710
711#ifdef MFEM_USE_HIP
712 // If Backend::HIP is allowed, use it
713   if (Device::Allows(Backend::HIP))
714   {
715 return HipWrap<DIM>::run(N, d_body, X, Y, Z, G);
716 }
717#endif
718
719 // If Backend::DEBUG_DEVICE is allowed, use it
720 if (Device::Allows(Backend::DEBUG_DEVICE)) { goto backend_cpu; }
721
722#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
723 // If Backend::RAJA_OMP is allowed, use it
724 if (Device::Allows(Backend::RAJA_OMP)) { return RajaOmpWrap(N, h_body); }
725#endif
726
727#ifdef MFEM_USE_OPENMP
728 // If Backend::OMP is allowed, use it
729 if (Device::Allows(Backend::OMP)) { return OmpWrap(N, h_body); }
730#endif
731
732#ifdef MFEM_USE_RAJA
733 // If Backend::RAJA_CPU is allowed, use it
734 if (Device::Allows(Backend::RAJA_CPU)) { return RajaSeqWrap(N, h_body); }
735#endif
736
737backend_cpu:
738 // Handle Backend::CPU. This is also a fallback for any allowed backends not
739 // handled above, e.g. OCCA_CPU with configuration 'occa-cpu,cpu', or
740 // OCCA_OMP with configuration 'occa-omp,cpu'.
741 for (int k = 0; k < N; k++) { h_body(k); }
742}
743
744template <const int DIM, typename lambda>
745inline void ForallWrap(const bool use_dev, const int N, lambda &&body,
746 const int X=0, const int Y=0, const int Z=0,
747 const int G=0)
748{
749 ForallWrap<DIM>(use_dev, N, body, body, X, Y, Z, G);
750}
751
752template<typename lambda>
753inline void forall(int N, lambda &&body) { ForallWrap<1>(true, N, body); }
754
755template<typename lambda>
756inline void forall_switch(bool use_dev, int N, lambda &&body)
757{
758 ForallWrap<1>(use_dev, N, body);
759}
760
761template<typename lambda>
762inline void forall_2D(int N, int X, int Y, lambda &&body)
763{
764 ForallWrap<2>(true, N, body, X, Y, 1);
765}
766
767template<typename lambda>
768inline void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
769{
770 ForallWrap<2>(true, N, body, X, Y, BZ);
771}
772
773template<typename lambda>
774inline void forall_3D(int N, int X, int Y, int Z, lambda &&body)
775{
776 ForallWrap<3>(true, N, body, X, Y, Z, 0);
777}
778
779template<typename lambda>
780inline void forall_3D_grid(int N, int X, int Y, int Z, int G, lambda &&body)
781{
782 ForallWrap<3>(true, N, body, X, Y, Z, G);
783}
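// Illustrative sketch (not part of this header): the function-style interface
// covers the same cases without macros. The function names are hypothetical
// and the pointers are assumed to be device-accessible.
// inline void ExampleForall(int n, double a, const double *x, double *y)
// {
//    forall(n, [=] MFEM_HOST_DEVICE (int i) { y[i] += a * x[i]; });
// }
// inline void ExampleForallSwitch(bool on_device, int n, double *y)
// {
//    // Small loops can force host execution by passing use_dev = false.
//    forall_switch(on_device, n, [=] MFEM_HOST_DEVICE (int i) { y[i] = 0.0; });
// }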
784
785#ifdef MFEM_USE_MPI
786
787// Function mfem::hypre_forall_cpu() similar to mfem::forall, but it always
788// executes on the CPU using sequential or OpenMP-parallel execution based on
789// the hypre build time configuration.
790template<typename lambda>
791inline void hypre_forall_cpu(int N, lambda &&body)
792{
793#ifdef HYPRE_USING_OPENMP
794 #pragma omp parallel for HYPRE_SMP_SCHEDULE
795#endif
796 for (int i = 0; i < N; i++) { body(i); }
797}
798
799// Function mfem::hypre_forall_gpu() similar to mfem::forall, but it always
800// executes on the GPU device that hypre was configured with at build time.
801#if defined(HYPRE_USING_GPU)
802template<typename lambda>
803inline void hypre_forall_gpu(int N, lambda &&body)
804{
805#if defined(HYPRE_USING_CUDA)
806 CuWrap1D(N, body);
807#elif defined(HYPRE_USING_HIP)
808 HipWrap1D(N, body);
809#else
810#error Unknown HYPRE GPU backend!
811#endif
812}
813#endif
814
815// Function mfem::hypre_forall() similar to mfem::forall, but it executes on the
816// device, CPU or GPU, that hypre was configured with at build time (when the
817// HYPRE version is < 2.31.0) or at runtime (when HYPRE was configured with GPU
818// support at build time and HYPRE's version is >= 2.31.0). This selection is
819// generally independent of what device was selected in MFEM's runtime
820// configuration.
821template<typename lambda>
822inline void hypre_forall(int N, lambda &&body)
823{
824#if !defined(HYPRE_USING_GPU)
825 hypre_forall_cpu(N, body);
826#elif MFEM_HYPRE_VERSION < 23100
827 hypre_forall_gpu(N, body);
828#else // HYPRE_USING_GPU is defined and MFEM_HYPRE_VERSION >= 23100
829 if (!HypreUsingGPU())
830 {
831 hypre_forall_cpu(N, body);
832 }
833 else
834 {
835 hypre_forall_gpu(N, body);
836 }
837#endif
838}
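// Illustrative sketch (not part of this header): a loop over data owned by
// hypre should use this wrapper so it runs wherever hypre keeps its data. The
// function name is hypothetical; it requires MFEM_USE_MPI and a pointer that
// is valid in hypre's memory space.
// inline void ExampleHypreScale(int n, double alpha, double *hypre_data)
// {
//    hypre_forall(n, [=] MFEM_HOST_DEVICE (int i) { hypre_data[i] *= alpha; });
// }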
839
840// Return the most general MemoryClass that can be used with mfem::hypre_forall
841// kernels. The returned MemoryClass is the same as the one returned by
842// GetHypreMemoryClass() except when hypre is configured to use UVM, in which
843// case this function returns MemoryClass::HOST or MemoryClass::DEVICE depending
844// on the result of HypreUsingGPU().
845inline MemoryClass GetHypreForallMemoryClass()
846{
847   return HypreUsingGPU() ? MemoryClass::DEVICE : MemoryClass::HOST;
848}
849
850#endif // MFEM_USE_MPI
851
852namespace internal
853{
854/**
855 @brief Device portion of a reduction over a 1D sequence [0, N)
856 @tparam B Reduction body. Must be callable with the signature void(int i, value_type&
857 v), where i is the index to evaluate and v is the value to update.
858 @tparam R Reducer capable of combining values of type value_type. See reducers.hpp for
859 pre-defined reducers.
860 */
861template<class B, class R> struct reduction_kernel
862{
863   /// Value type the body and reducer operate on.
864 using value_type = typename R::value_type;
865 /// workspace for the intermediate reduction results
866 mutable value_type *work;
867 B body;
868 R reducer;
869 /// Length of sequence to reduce over.
870 int N;
871   /// How many items each thread is responsible for during the serial phase.
872 int items_per_thread;
873
874 constexpr static MFEM_HOST_DEVICE int max_blocksize() { return 256; }
875
876 /// helper for computing the reduction block size
877 static int block_log2(unsigned N)
878 {
879#if defined(__GNUC__) or defined(__clang__)
880 return N ? (sizeof(unsigned) * 8 - __builtin_clz(N)) : 0;
881#elif defined(_MSC_VER)
882      return sizeof(unsigned) * 8 - __lzcnt(N);
883#else
884 int res = 0;
885 while (N)
886 {
887 N >>= 1;
888 ++res;
889 }
890 return res;
891#endif
892 }
893
894 MFEM_HOST_DEVICE void operator()(int work_idx) const
895 {
896 MFEM_SHARED value_type buffer[max_blocksize()];
897 reducer.SetInitialValue(buffer[MFEM_THREAD_ID(x)]);
898 // serial part
899 for (int idx = 0; idx < items_per_thread; ++idx)
900 {
901 int i = MFEM_THREAD_ID(x) +
902 (idx + work_idx * items_per_thread) * MFEM_THREAD_SIZE(x);
903 if (i < N)
904 {
905 body(i, buffer[MFEM_THREAD_ID(x)]);
906 }
907 else
908 {
909 break;
910 }
911 }
912 // binary tree reduction
913 for (int i = (MFEM_THREAD_SIZE(x) >> 1); i > 0; i >>= 1)
914 {
915 MFEM_SYNC_THREAD;
916 if (MFEM_THREAD_ID(x) < i)
917 {
918 reducer.Join(buffer[MFEM_THREAD_ID(x)], buffer[MFEM_THREAD_ID(x) + i]);
919 }
920 }
921 if (MFEM_THREAD_ID(x) == 0)
922 {
923 work[work_idx] = buffer[0];
924 }
925 }
926};
927}
928
929/**
930 @brief Performs a 1D reduction on the range [0,N).
931 @a res initial value and where the result will be written.
932 @a body reduction function body.
933 @a reducer helper for joining two reduced values.
934 @a use_dev true to perform the reduction on the device, if possible.
935 @a workspace temporary workspace used for device reductions. May be resized to
936 a larger capacity as needed. Preferably should have MemoryType::MANAGED or
937 MemoryType::HOST_PINNED. TODO: replace with internal temporary workspace
938 vectors once that's added to the memory manager.
939 @tparam T value_type to operate on
940 */
941template <class T, class B, class R>
942void reduce(int N, T &res, B &&body, const R &reducer, bool use_dev,
943 Array<T> &workspace)
944{
945 if (N == 0)
946 {
947 return;
948 }
949
950#if defined(MFEM_USE_HIP) || defined(MFEM_USE_CUDA)
951   if (use_dev &&
952       Device::Allows(Backend::CUDA_MASK |
953                      Backend::HIP_MASK))
954   {
955 using red_type = internal::reduction_kernel<typename std::decay<B>::type,
956 typename std::decay<R>::type>;
957 // max block size is 256, but can be smaller
958 int block_size = std::min<int>(red_type::max_blocksize(),
959 1ll << red_type::block_log2(N));
960
960
961      int num_mp = Device::NumMultiprocessors(Device::GetId());
962#if defined(MFEM_USE_CUDA)
963 // good value of mp_sat found experimentally on Lassen
964 constexpr int mp_sat = 8;
965#elif defined(MFEM_USE_HIP)
966 // good value of mp_sat found experimentally on Tuolumne
967 constexpr int mp_sat = 4;
968#else
969 num_mp = 1;
970 constexpr int mp_sat = 1;
971#endif
972 // determine how many items each thread should sum during the serial
973 // portion
974 int nblocks = std::min(mp_sat * num_mp, (N + block_size - 1) / block_size);
975 int items_per_thread =
976 (N + block_size * nblocks - 1) / (block_size * nblocks);
977
978 red_type red{nullptr, std::forward<B>(body), reducer, N, items_per_thread};
979      // allocate the workspace to fit nblocks partial results
980 auto mt = workspace.GetMemory().GetMemoryType();
981      if (mt != MemoryType::HOST_PINNED && mt != MemoryType::MANAGED)
982      {
983         mt = MemoryType::HOST_PINNED;
984      }
985 workspace.SetSize(nblocks, mt);
986 auto work = workspace.HostWrite();
987 red.work = work;
988 forall_2D(nblocks, block_size, 1, std::move(red));
989 // wait for results
990 MFEM_DEVICE_SYNC;
991 for (int i = 0; i < nblocks; ++i)
992 {
993 reducer.Join(res, work[i]);
994 }
995 return;
996 }
997#endif
998
999 for (int i = 0; i < N; ++i)
1000 {
1001 body(i, res);
1002 }
1003}
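// Illustrative sketch (not part of this header): a sum-of-squares reduction
// with mfem::reduce. The function name is hypothetical, SumReducer is assumed
// to be one of the reducers defined in reducers.hpp, and d_data is assumed to
// be device-accessible.
// inline double ExampleSumOfSquares(int n, const double *d_data,
//                                   Array<double> &workspace)
// {
//    double result = 0.0;
//    reduce(n, result,
//           [=] MFEM_HOST_DEVICE (int i, double &v) { v += d_data[i]*d_data[i]; },
//           SumReducer<double> {}, /*use_dev=*/true, workspace);
//    return result;
// }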
1004
1005} // namespace mfem
1006
1007#endif // MFEM_FORALL_HPP