MFEM v4.7.0
Finite element discretization library

forall.hpp
// Copyright (c) 2010-2024, Lawrence Livermore National Security, LLC. Produced
// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
// LICENSE and NOTICE for details. LLNL-CODE-806117.
//
// This file is part of the MFEM library. For more information and source code
// availability visit https://mfem.org.
//
// MFEM is free software; you can redistribute it and/or modify it under the
// terms of the BSD-3 license. We welcome feedback and contributions, see file
// CONTRIBUTING.md for details.

#ifndef MFEM_FORALL_HPP
#define MFEM_FORALL_HPP

#include "../config/config.hpp"
#include "annotation.hpp"
#include "error.hpp"
#include "backends.hpp"
#include "device.hpp"
#include "mem_manager.hpp"
#include "../linalg/dtensor.hpp"
#ifdef MFEM_USE_MPI
#include <_hypre_utilities.h>
#endif

namespace mfem
{

// The following DofQuadLimits_ structs define the maximum values of D1D and
// Q1D often used in the "fallback kernels" for partial assembly. Different
// limits take effect for different architectures. The limits should be queried
// using the public interface in DeviceDofQuadLimits or DofQuadLimits, and
// generally not by directly accessing the structs defined below.
//
// In host code, the limits associated with the currently configured Device can
// be accessed using DeviceDofQuadLimits::Get().
//
// In mfem::forall kernels or MFEM_HOST_DEVICE functions, the limits
// corresponding to the architecture the function is being compiled for can be
// accessed as static constexpr variables using the type alias DofQuadLimits.

namespace internal
{

struct DofQuadLimits_CUDA
{
   static constexpr int MAX_D1D = 14;
   static constexpr int MAX_Q1D = 14;
   static constexpr int HCURL_MAX_D1D = 5;
   static constexpr int HCURL_MAX_Q1D = 6;
   static constexpr int HDIV_MAX_D1D = 5;
   static constexpr int HDIV_MAX_Q1D = 6;
   static constexpr int MAX_INTERP_1D = 8;
   static constexpr int MAX_DET_1D = 6;
};

struct DofQuadLimits_HIP
{
   static constexpr int MAX_D1D = 10;
   static constexpr int MAX_Q1D = 10;
   static constexpr int HCURL_MAX_D1D = 5;
   static constexpr int HCURL_MAX_Q1D = 5;
   static constexpr int HDIV_MAX_D1D = 5;
   static constexpr int HDIV_MAX_Q1D = 6;
   static constexpr int MAX_INTERP_1D = 8;
   static constexpr int MAX_DET_1D = 6;
};

struct DofQuadLimits_CPU
{
#ifndef _WIN32
   static constexpr int MAX_D1D = 24;
   static constexpr int MAX_Q1D = 24;
#else
   static constexpr int MAX_D1D = 14;
   static constexpr int MAX_Q1D = 14;
#endif
   static constexpr int HCURL_MAX_D1D = 10;
   static constexpr int HCURL_MAX_Q1D = 10;
   static constexpr int HDIV_MAX_D1D = 10;
   static constexpr int HDIV_MAX_Q1D = 10;
   static constexpr int MAX_INTERP_1D = MAX_D1D;
   static constexpr int MAX_DET_1D = MAX_D1D;
};

} // namespace internal

/// @brief Maximum number of 1D DOFs or quadrature points for the architecture
/// currently being compiled for (used in fallback kernels).
///
/// DofQuadLimits provides access to the limits as static constexpr member
/// variables for use in mfem::forall kernels or MFEM_HOST_DEVICE functions.
///
/// @sa For accessing the limits according to the runtime configuration of the
/// Device, see DeviceDofQuadLimits.
#if defined(__CUDA_ARCH__)
using DofQuadLimits = internal::DofQuadLimits_CUDA;
#elif defined(__HIP_DEVICE_COMPILE__)
using DofQuadLimits = internal::DofQuadLimits_HIP;
#else
using DofQuadLimits = internal::DofQuadLimits_CPU;
#endif
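
// Example (editorial sketch, not part of the original header): because
// DofQuadLimits resolves at compile time for the current compilation pass,
// its members are valid constants for sizing stack arrays in device or host
// kernels. Eval is a hypothetical helper:
//
//    MFEM_HOST_DEVICE void Eval(/* ... */)
//    {
//       // stack buffer sized by the compile-time limit
//       double B[DofQuadLimits::MAX_Q1D * DofQuadLimits::MAX_D1D];
//       /* ... fill and use B ... */
//    }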

/// @brief Maximum number of 1D DOFs or quadrature points for the current
/// runtime configuration of the Device (used in fallback kernels).
///
/// DeviceDofQuadLimits can be used in host code to query the limits for the
/// configured device (e.g. when the user has selected GPU execution at
/// runtime).
///
/// @sa For accessing the limits according to the current compiler pass, see
/// DofQuadLimits.
struct DeviceDofQuadLimits
{
   int MAX_D1D; ///< Maximum number of 1D nodal points.
   int MAX_Q1D; ///< Maximum number of 1D quadrature points.
   int HCURL_MAX_D1D; ///< Maximum number of 1D nodal points for H(curl).
   int HCURL_MAX_Q1D; ///< Maximum number of 1D quadrature points for H(curl).
   int HDIV_MAX_D1D; ///< Maximum number of 1D nodal points for H(div).
   int HDIV_MAX_Q1D; ///< Maximum number of 1D quadrature points for H(div).
   int MAX_INTERP_1D; ///< Maximum number of points for use in QuadratureInterpolator.
   int MAX_DET_1D; ///< Maximum number of points for determinant computation in QuadratureInterpolator.

   /// Return a const reference to the DeviceDofQuadLimits singleton.
   static const DeviceDofQuadLimits &Get()
   {
      static const DeviceDofQuadLimits dof_quad_limits;
      return dof_quad_limits;
   }

private:
   /// Initialize the limits depending on the configuration of the Device.
   DeviceDofQuadLimits()
   {
      if (Device::Allows(Backend::CUDA_MASK)) { Populate<internal::DofQuadLimits_CUDA>(); }
      else if (Device::Allows(Backend::HIP_MASK)) { Populate<internal::DofQuadLimits_HIP>(); }
      else { Populate<internal::DofQuadLimits_CPU>(); }
   }

   /// @brief Set the limits using the static members of the type @a T.
   ///
   /// @a T should be one of DofQuadLimits_CUDA, DofQuadLimits_HIP, or
   /// DofQuadLimits_CPU.
   template <typename T> void Populate()
   {
      MAX_D1D = T::MAX_D1D;
      MAX_Q1D = T::MAX_Q1D;
      HCURL_MAX_D1D = T::HCURL_MAX_D1D;
      HCURL_MAX_Q1D = T::HCURL_MAX_Q1D;
      HDIV_MAX_D1D = T::HDIV_MAX_D1D;
      HDIV_MAX_Q1D = T::HDIV_MAX_Q1D;
      MAX_INTERP_1D = T::MAX_INTERP_1D;
      MAX_DET_1D = T::MAX_DET_1D;
   }
};
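
// Example (editorial sketch): host-side query of the runtime limits, e.g. to
// reject an order that the configured device's fallback kernels cannot handle
// (d1d and q1d are hypothetical per-kernel sizes):
//
//    const DeviceDofQuadLimits &lim = DeviceDofQuadLimits::Get();
//    MFEM_VERIFY(d1d <= lim.MAX_D1D && q1d <= lim.MAX_Q1D,
//                "polynomial order too high for the configured device");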

// MFEM pragma macros that can be used inside MFEM_FORALL macros.
#define MFEM_PRAGMA(X) _Pragma(#X)

// MFEM_UNROLL pragma macro that can be used inside MFEM_FORALL macros.
#if defined(MFEM_USE_CUDA) && defined(__CUDA_ARCH__)
#define MFEM_UNROLL(N) MFEM_PRAGMA(unroll(N))
#else
#define MFEM_UNROLL(N)
#endif
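
// Example (editorial sketch): MFEM_UNROLL annotates fixed-trip-count inner
// loops inside kernel bodies; it expands to a CUDA unroll pragma only during
// CUDA device compilation and to nothing on all other paths:
//
//    MFEM_UNROLL(4)
//    for (int c = 0; c < 4; c++) { sum += v[c]; }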

// MFEM_GPU_FORALL: "parallel for" executed with CUDA or HIP based on the MFEM
// build-time configuration (MFEM_USE_CUDA or MFEM_USE_HIP). If neither CUDA
// nor HIP is enabled, this macro is a no-op.
#if defined(MFEM_USE_CUDA)
#define MFEM_GPU_FORALL(i, N,...) CuWrap1D(N, [=] MFEM_DEVICE \
                                   (int i) {__VA_ARGS__})
#elif defined(MFEM_USE_HIP)
#define MFEM_GPU_FORALL(i, N,...) HipWrap1D(N, [=] MFEM_DEVICE \
                                    (int i) {__VA_ARGS__})
#else
#define MFEM_GPU_FORALL(i, N,...) do { } while (false)
#endif
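
// Example (editorial sketch): unlike mfem::forall below, MFEM_GPU_FORALL is
// tied to the build-time GPU backend and ignores the runtime Device selection
// (x and y are hypothetical device pointers):
//
//    MFEM_GPU_FORALL(i, n, y[i] += x[i];);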

// Implementation of MFEM's "parallel for" (forall) device/host kernel
// interfaces supporting RAJA, CUDA, OpenMP, and sequential backends.

// The MFEM_FORALL wrapper
#define MFEM_FORALL(i,N,...) \
   ForallWrap<1>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__})

// MFEM_FORALL with a 2D CUDA block
#define MFEM_FORALL_2D(i,N,X,Y,BZ,...) \
   ForallWrap<2>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,BZ)

// MFEM_FORALL with a 3D CUDA block
#define MFEM_FORALL_3D(i,N,X,Y,Z,...) \
   ForallWrap<3>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,Z)

// MFEM_FORALL with a 3D CUDA block and grid
// With G=0, this is the same as MFEM_FORALL_3D(i,N,X,Y,Z,...)
#define MFEM_FORALL_3D_GRID(i,N,X,Y,Z,G,...) \
   ForallWrap<3>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,Z,G)

// MFEM_FORALL that uses the basic CPU backend when use_dev is false. See, for
// example, the functions in vector.cpp, where we don't want to use the mfem
// device for operations on small vectors.
#define MFEM_FORALL_SWITCH(use_dev,i,N,...) \
   ForallWrap<1>(use_dev,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__})
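
// Example (editorial sketch): a saxpy-style body over raw pointers captured
// by value (d_x and d_y are hypothetical device pointers, a is a scalar):
//
//    MFEM_FORALL(i, n, d_y[i] = a * d_x[i] + d_y[i];);
//
// For very small n, MFEM_FORALL_SWITCH(false, i, n, ...) runs the same body
// sequentially on the host.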


/// OpenMP backend
template <typename HBODY>
void OmpWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_OPENMP
   #pragma omp parallel for
   for (int k = 0; k < N; k++)
   {
      h_body(k);
   }
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("OpenMP requested for MFEM but OpenMP is not enabled!");
#endif
}


/// RAJA Cuda and Hip backends
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
using cuda_launch_policy =
   RAJA::LaunchPolicy<RAJA::cuda_launch_t<true>>;
using cuda_teams_x =
   RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
using cuda_threads_z =
   RAJA::LoopPolicy<RAJA::cuda_thread_z_direct>;
#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
using hip_launch_policy =
   RAJA::LaunchPolicy<RAJA::hip_launch_t<true>>;
using hip_teams_x =
   RAJA::LoopPolicy<RAJA::hip_block_x_direct>;
using hip_threads_z =
   RAJA::LoopPolicy<RAJA::hip_thread_z_direct>;
#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
template <const int BLOCKS = MFEM_CUDA_BLOCKS, typename DBODY>
void RajaCuWrap1D(const int N, DBODY &&d_body)
{
   // true denotes an asynchronous kernel launch
   RAJA::forall<RAJA::cuda_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
}

template <typename DBODY>
void RajaCuWrap2D(const int N, DBODY &&d_body,
                  const int X, const int Y, const int BZ)
{
   MFEM_VERIFY(N>0, "");
   MFEM_VERIFY(BZ>0, "");
   const int G = (N+BZ-1)/BZ;

   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<cuda_launch_policy>
   (LaunchParams(Teams(G), Threads(X, Y, BZ)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<cuda_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
      {
         loop<cuda_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
         {
            const int k = n*BZ + tz;
            if (k >= N) { return; }
            d_body(k);
         });
      });
   });

   MFEM_GPU_CHECK(cudaGetLastError());
}
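
// Editorial note on the batching above: BZ bodies share one team along the
// z-thread dimension, so G = (N+BZ-1)/BZ teams cover all N items. For
// example, N = 100 and BZ = 8 give G = 13 teams; team n, thread tz handles
// item k = n*8 + tz, and the k >= N guard discards the 4 unused slots in the
// last team.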

template <typename DBODY>
void RajaCuWrap3D(const int N, DBODY &&d_body,
                  const int X, const int Y, const int Z, const int G)
{
   MFEM_VERIFY(N>0, "");
   const int GRID = G == 0 ? N : G;
   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<cuda_launch_policy>
   (LaunchParams(Teams(GRID), Threads(X, Y, Z)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<cuda_teams_x>(ctx, RangeSegment(0, N), d_body);
   });

   MFEM_GPU_CHECK(cudaGetLastError());
}

template <int Dim>
struct RajaCuWrap;

template <>
struct RajaCuWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaCuWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct RajaCuWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaCuWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct RajaCuWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaCuWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
template <const int BLOCKS = MFEM_HIP_BLOCKS, typename DBODY>
void RajaHipWrap1D(const int N, DBODY &&d_body)
{
   // true denotes an asynchronous kernel launch
   RAJA::forall<RAJA::hip_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
}

template <typename DBODY>
void RajaHipWrap2D(const int N, DBODY &&d_body,
                   const int X, const int Y, const int BZ)
{
   MFEM_VERIFY(N>0, "");
   MFEM_VERIFY(BZ>0, "");
   const int G = (N+BZ-1)/BZ;

   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<hip_launch_policy>
   (LaunchParams(Teams(G), Threads(X, Y, BZ)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<hip_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
      {
         loop<hip_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
         {
            const int k = n*BZ + tz;
            if (k >= N) { return; }
            d_body(k);
         });
      });
   });

   MFEM_GPU_CHECK(hipGetLastError());
}

template <typename DBODY>
void RajaHipWrap3D(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
{
   MFEM_VERIFY(N>0, "");
   const int GRID = G == 0 ? N : G;
   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<hip_launch_policy>
   (LaunchParams(Teams(GRID), Threads(X, Y, Z)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<hip_teams_x>(ctx, RangeSegment(0, N), d_body);
   });

   MFEM_GPU_CHECK(hipGetLastError());
}

template <int Dim>
struct RajaHipWrap;

template <>
struct RajaHipWrap<1>
{
   // editorial fix: default the block size to MFEM_HIP_BLOCKS (the listing
   // had MFEM_CUDA_BLOCKS), matching RajaHipWrap1D above
   template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaHipWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct RajaHipWrap<2>
{
   template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaHipWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct RajaHipWrap<3>
{
   template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaHipWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif

/// RAJA OpenMP backend
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)

template <typename HBODY>
void RajaOmpWrap(const int N, HBODY &&h_body)
{
   RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0,N), h_body);
}

#endif


/// RAJA sequential loop backend
template <typename HBODY>
void RajaSeqWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_RAJA

#if (RAJA_VERSION_MAJOR >= 2023)
   // loop_exec was marked deprecated in RAJA version 2023.06.0 and will be
   // removed; we now use seq_exec.
   using raja_forall_pol = RAJA::seq_exec;
#else
   using raja_forall_pol = RAJA::loop_exec;
#endif

   RAJA::forall<raja_forall_pol>(RAJA::RangeSegment(0,N), h_body);
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("RAJA requested but RAJA is not enabled!");
#endif
}


/// CUDA backend
#ifdef MFEM_USE_CUDA

template <typename BODY> __global__ static
void CuKernel1D(const int N, BODY body)
{
   const int k = blockDim.x*blockIdx.x + threadIdx.x;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void CuKernel2D(const int N, BODY body)
{
   const int k = blockIdx.x*blockDim.z + threadIdx.z;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void CuKernel3D(const int N, BODY body)
{
   for (int k = blockIdx.x; k < N; k += gridDim.x) { body(k); }
}

template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
void CuWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;
   CuKernel1D<<<GRID,BLCK>>>(N, d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}

template <typename DBODY>
void CuWrap2D(const int N, DBODY &&d_body,
              const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   MFEM_VERIFY(BZ>0, "");
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   CuKernel2D<<<GRID,BLCK>>>(N,d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}

template <typename DBODY>
void CuWrap3D(const int N, DBODY &&d_body,
              const int X, const int Y, const int Z, const int G)
{
   if (N==0) { return; }
   const int GRID = G == 0 ? N : G;
   const dim3 BLCK(X,Y,Z);
   CuKernel3D<<<GRID,BLCK>>>(N,d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}
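
// Editorial note: CuKernel3D uses a block-level grid-stride loop, so the
// launch stays correct for any grid size. With G == 0 the wrapper launches
// one block per item (GRID = N); with a positive G, each of the G blocks
// iterates k = blockIdx.x, blockIdx.x + G, ... until k >= N.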

template <int Dim>
struct CuWrap;

template <>
struct CuWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      CuWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct CuWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      CuWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct CuWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      CuWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif // MFEM_USE_CUDA


/// HIP backend
#ifdef MFEM_USE_HIP

template <typename BODY> __global__ static
void HipKernel1D(const int N, BODY body)
{
   const int k = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void HipKernel2D(const int N, BODY body)
{
   const int k = hipBlockIdx_x*hipBlockDim_z + hipThreadIdx_z;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void HipKernel3D(const int N, BODY body)
{
   for (int k = hipBlockIdx_x; k < N; k += hipGridDim_x) { body(k); }
}

template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
void HipWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;
   hipLaunchKernelGGL(HipKernel1D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}

template <typename DBODY>
void HipWrap2D(const int N, DBODY &&d_body,
               const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   MFEM_VERIFY(BZ>0, ""); // editorial addition: guard BZ, mirroring CuWrap2D
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   hipLaunchKernelGGL(HipKernel2D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}

template <typename DBODY>
void HipWrap3D(const int N, DBODY &&d_body,
               const int X, const int Y, const int Z, const int G)
{
   if (N==0) { return; }
   const int GRID = G == 0 ? N : G;
   const dim3 BLCK(X,Y,Z);
   hipLaunchKernelGGL(HipKernel3D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}

template <int Dim>
struct HipWrap;

template <>
struct HipWrap<1>
{
   // editorial fix: default the block size to MFEM_HIP_BLOCKS (the listing
   // had MFEM_CUDA_BLOCKS), matching HipWrap1D above
   template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      HipWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct HipWrap<2>
{
   template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      HipWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct HipWrap<3>
{
   template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      HipWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif // MFEM_USE_HIP


/// The forall kernel body wrapper
template <const int DIM, typename d_lambda, typename h_lambda>
inline void ForallWrap(const bool use_dev, const int N,
                       d_lambda &&d_body, h_lambda &&h_body,
                       const int X=0, const int Y=0, const int Z=0,
                       const int G=0)
{
   MFEM_CONTRACT_VAR(X);
   MFEM_CONTRACT_VAR(Y);
   MFEM_CONTRACT_VAR(Z);
   MFEM_CONTRACT_VAR(G);
   MFEM_CONTRACT_VAR(d_body);
   if (!use_dev) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
   // If Backend::RAJA_CUDA is allowed, use it
   if (Device::Allows(Backend::RAJA_CUDA))
   {
      return RajaCuWrap<DIM>::run(N, d_body, X, Y, Z, G);
   }
#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
   // If Backend::RAJA_HIP is allowed, use it
   if (Device::Allows(Backend::RAJA_HIP))
   {
      return RajaHipWrap<DIM>::run(N, d_body, X, Y, Z, G);
   }
#endif

#ifdef MFEM_USE_CUDA
   // If Backend::CUDA is allowed, use it
   if (Device::Allows(Backend::CUDA))
   {
      return CuWrap<DIM>::run(N, d_body, X, Y, Z, G);
   }
#endif

#ifdef MFEM_USE_HIP
   // If Backend::HIP is allowed, use it
   if (Device::Allows(Backend::HIP))
   {
      return HipWrap<DIM>::run(N, d_body, X, Y, Z, G);
   }
#endif

   // If Backend::DEBUG_DEVICE is allowed, use it
   if (Device::Allows(Backend::DEBUG_DEVICE)) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
   // If Backend::RAJA_OMP is allowed, use it
   if (Device::Allows(Backend::RAJA_OMP)) { return RajaOmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_OPENMP
   // If Backend::OMP is allowed, use it
   if (Device::Allows(Backend::OMP)) { return OmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_RAJA
   // If Backend::RAJA_CPU is allowed, use it
   if (Device::Allows(Backend::RAJA_CPU)) { return RajaSeqWrap(N, h_body); }
#endif

backend_cpu:
   // Handle Backend::CPU. This is also a fallback for any allowed backends not
   // handled above, e.g. OCCA_CPU with configuration 'occa-cpu,cpu', or
   // OCCA_OMP with configuration 'occa-omp,cpu'.
   for (int k = 0; k < N; k++) { h_body(k); }
}
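
// Editorial note: the overload above accepts separate device and host bodies
// and dispatches in priority order (RAJA GPU, native GPU, debug, OpenMP, RAJA
// CPU, plain loop), so the first backend that is both compiled in and allowed
// at runtime wins. The single-lambda overload below forwards the same body to
// both paths.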

template <const int DIM, typename lambda>
inline void ForallWrap(const bool use_dev, const int N, lambda &&body,
                       const int X=0, const int Y=0, const int Z=0,
                       const int G=0)
{
   ForallWrap<DIM>(use_dev, N, body, body, X, Y, Z, G);
}

template<typename lambda>
inline void forall(int N, lambda &&body) { ForallWrap<1>(true, N, body); }

template<typename lambda>
inline void forall_switch(bool use_dev, int N, lambda &&body)
{
   ForallWrap<1>(use_dev, N, body);
}
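
// Example (editorial sketch): the function form is the modern equivalent of
// MFEM_FORALL; here x and y are hypothetical pointers obtained from MFEM
// memory-managed containers (e.g. Vector::Read()/Write()):
//
//    mfem::forall(n, [=] MFEM_HOST_DEVICE (int i) { y[i] = a * x[i]; });
//
// forall_switch(use_dev, ...) runs the identical body on the host whenever
// use_dev is false.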

template<typename lambda>
inline void forall_2D(int N, int X, int Y, lambda &&body)
{
   ForallWrap<2>(true, N, body, X, Y, 1);
}

template<typename lambda>
inline void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
{
   ForallWrap<2>(true, N, body, X, Y, BZ);
}

template<typename lambda>
inline void forall_3D(int N, int X, int Y, int Z, lambda &&body)
{
   ForallWrap<3>(true, N, body, X, Y, Z, 0);
}

template<typename lambda>
inline void forall_3D_grid(int N, int X, int Y, int Z, int G, lambda &&body)
{
   ForallWrap<3>(true, N, body, X, Y, Z, G);
}
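
// Example (editorial sketch): the 2D/3D variants assign one thread block per
// item i, with the block shaped (X,Y) or (X,Y,Z); forall_2D_batch
// additionally packs BZ items per block along z. A typical element kernel
// (ne and q1d are hypothetical sizes; MFEM_FOREACH_THREAD is MFEM's
// device-loop helper from its device headers):
//
//    mfem::forall_2D(ne, q1d, q1d, [=] MFEM_HOST_DEVICE (int e)
//    {
//       MFEM_FOREACH_THREAD(qy,y,q1d)
//       {
//          MFEM_FOREACH_THREAD(qx,x,q1d)
//          {
//             /* ... work on quadrature point (qx,qy) of element e ... */
//          }
//       }
//    });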

#ifdef MFEM_USE_MPI

// Function mfem::hypre_forall_cpu() is similar to mfem::forall, but it always
// executes on the CPU using sequential or OpenMP-parallel execution based on
// the hypre build-time configuration.
template<typename lambda>
inline void hypre_forall_cpu(int N, lambda &&body)
{
#ifdef HYPRE_USING_OPENMP
   #pragma omp parallel for HYPRE_SMP_SCHEDULE
#endif
   for (int i = 0; i < N; i++) { body(i); }
}

// Function mfem::hypre_forall_gpu() is similar to mfem::forall, but it always
// executes on the GPU device that hypre was configured with at build time.
#if defined(HYPRE_USING_GPU)
template<typename lambda>
inline void hypre_forall_gpu(int N, lambda &&body)
{
#if defined(HYPRE_USING_CUDA)
   CuWrap1D(N, body);
#elif defined(HYPRE_USING_HIP)
   HipWrap1D(N, body);
#else
#error Unknown HYPRE GPU backend!
#endif
}
#endif

// Function mfem::hypre_forall() is similar to mfem::forall, but it executes on
// the device, CPU or GPU, that hypre was configured with at build time (when
// the HYPRE version is < 2.31.0) or at runtime (when HYPRE was configured with
// GPU support at build time and HYPRE's version is >= 2.31.0). This selection
// is generally independent of the device selected in MFEM's runtime
// configuration.
template<typename lambda>
inline void hypre_forall(int N, lambda &&body)
{
#if !defined(HYPRE_USING_GPU)
   hypre_forall_cpu(N, body);
#elif MFEM_HYPRE_VERSION < 23100
   hypre_forall_gpu(N, body);
#else // HYPRE_USING_GPU is defined and MFEM_HYPRE_VERSION >= 23100
   if (!HypreUsingGPU())
   {
      hypre_forall_cpu(N, body);
   }
   else
   {
      hypre_forall_gpu(N, body);
   }
#endif
}
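
// Example (editorial sketch): hypre_forall is the right wrapper when writing
// into memory that hypre owns, since the loop must run where hypre's data
// lives (d_hypre_data is a hypothetical pointer into a hypre-managed array):
//
//    mfem::hypre_forall(n, [=] MFEM_HOST_DEVICE (int i)
//    {
//       d_hypre_data[i] = 0.0;
//    });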

#endif // MFEM_USE_MPI

} // namespace mfem

#endif // MFEM_FORALL_HPP