#ifndef MFEM_FORALL_HPP
#define MFEM_FORALL_HPP

#include <_hypre_utilities.h>

namespace mfem
{
namespace internal
{

struct DofQuadLimits_CUDA
{
   static constexpr int MAX_D1D = 14;
   static constexpr int MAX_Q1D = 14;
   static constexpr int HCURL_MAX_D1D = 5;
   static constexpr int HCURL_MAX_Q1D = 6;
   static constexpr int HDIV_MAX_D1D = 5;
   static constexpr int HDIV_MAX_Q1D = 6;
   static constexpr int MAX_INTERP_1D = 8;
   static constexpr int MAX_DET_1D = 6;
};
struct DofQuadLimits_HIP
{
   static constexpr int MAX_D1D = 10;
   static constexpr int MAX_Q1D = 10;
   static constexpr int HCURL_MAX_D1D = 5;
   static constexpr int HCURL_MAX_Q1D = 5;
   static constexpr int HDIV_MAX_D1D = 5;
   static constexpr int HDIV_MAX_Q1D = 6;
   static constexpr int MAX_INTERP_1D = 8;
   static constexpr int MAX_DET_1D = 6;
};
struct DofQuadLimits_CPU
{
#ifndef _WIN32
   static constexpr int MAX_D1D = 24;
   static constexpr int MAX_Q1D = 24;
#else
   static constexpr int MAX_D1D = 14;
   static constexpr int MAX_Q1D = 14;
#endif
   static constexpr int HCURL_MAX_D1D = 10;
   static constexpr int HCURL_MAX_Q1D = 10;
   static constexpr int HDIV_MAX_D1D = 10;
   static constexpr int HDIV_MAX_Q1D = 10;
   static constexpr int MAX_INTERP_1D = MAX_D1D;
   static constexpr int MAX_DET_1D = MAX_D1D;
};

} // namespace internal
/// Maximum number of 1D DOFs or quadrature points for the architecture
/// currently being compiled for (used in fallback kernels).
///
/// DofQuadLimits provides access to the limits as static constexpr member
/// variables for use in mfem::forall kernels or MFEM_HOST_DEVICE functions.
#if defined(__CUDA_ARCH__)
using DofQuadLimits = internal::DofQuadLimits_CUDA;
#elif defined(__HIP_DEVICE_COMPILE__)
using DofQuadLimits = internal::DofQuadLimits_HIP;
#else
using DofQuadLimits = internal::DofQuadLimits_CPU;
#endif
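// Example (a sketch): the compile-time limits are typically used to size
// static arrays inside kernel bodies; `MD1` and the scratch array here are
// hypothetical, not part of this header.
//
//    constexpr int MD1 = DofQuadLimits::MAX_D1D;
//    double scratch[MD1*MD1]; // statically sized scratch space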
/// Maximum number of 1D DOFs or quadrature points for the current runtime
/// configuration of the Device (used in fallback kernels).
struct DeviceDofQuadLimits
{
   int MAX_D1D;       ///< Maximum number of 1D nodal points.
   int MAX_Q1D;       ///< Maximum number of 1D quadrature points.
   int HCURL_MAX_D1D; ///< Maximum number of 1D nodal points for H(curl).
   int HCURL_MAX_Q1D; ///< Maximum number of 1D quadrature points for H(curl).
   int HDIV_MAX_D1D;  ///< Maximum number of 1D nodal points for H(div).
   int HDIV_MAX_Q1D;  ///< Maximum number of 1D quadrature points for H(div).
   int MAX_INTERP_1D; ///< Maximum number of points for use in QuadratureInterpolator.
   int MAX_DET_1D;    ///< Maximum number of points for determinant computation in QuadratureInterpolator.

   /// Return a const reference to the DeviceDofQuadLimits singleton.
   static const DeviceDofQuadLimits &Get()
   {
      static const DeviceDofQuadLimits dof_quad_limits;
      return dof_quad_limits;
   }

private:
   DeviceDofQuadLimits()
   {
      if (Device::Allows(Backend::CUDA_MASK)) { Populate<internal::DofQuadLimits_CUDA>(); }
      else if (Device::Allows(Backend::HIP_MASK)) { Populate<internal::DofQuadLimits_HIP>(); }
      else { Populate<internal::DofQuadLimits_CPU>(); }
   }

   template <typename T> void Populate()
   {
      // Copy each compile-time limit of the selected backend into the
      // runtime struct.
      MAX_D1D = T::MAX_D1D;
      MAX_Q1D = T::MAX_Q1D;
      HCURL_MAX_D1D = T::HCURL_MAX_D1D;
      HCURL_MAX_Q1D = T::HCURL_MAX_Q1D;
      HDIV_MAX_D1D = T::HDIV_MAX_D1D;
      HDIV_MAX_Q1D = T::HDIV_MAX_Q1D;
      MAX_INTERP_1D = T::MAX_INTERP_1D;
      MAX_DET_1D = T::MAX_DET_1D;
   }
};
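// Example (a sketch): query the limits of the active runtime Device, e.g. to
// validate a kernel's order before dispatch (`D1D` and `Q1D` are hypothetical
// values taken from a finite element space):
//
//    const DeviceDofQuadLimits &limits = DeviceDofQuadLimits::Get();
//    MFEM_VERIFY(D1D <= limits.MAX_D1D && Q1D <= limits.MAX_Q1D,
//                "order not supported by the device kernels");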
// MFEM pragma macros that can be used inside MFEM_FORALL macros.
#define MFEM_PRAGMA(X) _Pragma(#X)

// MFEM_UNROLL pragma macro that can be used inside MFEM_FORALL macros.
#if defined(MFEM_USE_CUDA) && defined(__CUDA_ARCH__)
#define MFEM_UNROLL(N) MFEM_PRAGMA(unroll(N))
#else
#define MFEM_UNROLL(N)
#endif
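// Example (a sketch): MFEM_UNROLL expands to "#pragma unroll(N)" only when
// compiling CUDA device code, so it can be left in portable kernel bodies:
//
//    MFEM_UNROLL(4)
//    for (int c = 0; c < 4; c++) { sum += v[c]; }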
// MFEM_GPU_FORALL: "parallel for" executed with CUDA or HIP based on the
// MFEM build-time configuration (MFEM_USE_CUDA or MFEM_USE_HIP). If neither
// is enabled, this macro is a no-op.
#if defined(MFEM_USE_CUDA)
#define MFEM_GPU_FORALL(i, N,...) CuWrap1D(N, [=] MFEM_DEVICE      \
                                           (int i) {__VA_ARGS__})
#elif defined(MFEM_USE_HIP)
#define MFEM_GPU_FORALL(i, N,...) HipWrap1D(N, [=] MFEM_DEVICE     \
                                            (int i) {__VA_ARGS__})
#else
#define MFEM_GPU_FORALL(i, N,...) do { } while (false)
#endif
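// Example (a sketch): zero a device array of length N; `d_ptr` is a
// hypothetical device pointer:
//
//    MFEM_GPU_FORALL(i, N, d_ptr[i] = 0.0;);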
// MFEM_FORALL wrapper: a 1D "parallel for" over N indices, dispatched to the
// device backend when one is enabled.
#define MFEM_FORALL(i,N,...) \
   ForallWrap<1>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__})
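// Example (a sketch): an axpy-style update, one index per thread; `d_x`,
// `d_y`, and `a` are hypothetical device-visible arrays/scalar (e.g. from
// mfem::Vector::Read()/ReadWrite()):
//
//    MFEM_FORALL(i, N, d_y[i] += a * d_x[i];);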
// MFEM_FORALL with a 2D CUDA block of size X x Y x BZ.
#define MFEM_FORALL_2D(i,N,X,Y,BZ,...) \
   ForallWrap<2>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,BZ)
// MFEM_FORALL with a 3D CUDA block of size X x Y x Z.
#define MFEM_FORALL_3D(i,N,X,Y,Z,...) \
   ForallWrap<3>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,Z)
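// Example (a sketch): one block of Q1D x Q1D x Q1D threads per element,
// using MFEM's MFEM_FOREACH_THREAD macro to distribute quadrature points
// over the block; `NE`, `Q1D`, and `qdata` are hypothetical:
//
//    MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D,
//    {
//       MFEM_FOREACH_THREAD(qz,z,Q1D)
//       {
//          MFEM_FOREACH_THREAD(qy,y,Q1D)
//          {
//             MFEM_FOREACH_THREAD(qx,x,Q1D)
//             {
//                qdata(qx,qy,qz,e) = 0.0;
//             }
//          }
//       }
//    });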
// MFEM_FORALL with a 3D CUDA block of size X x Y x Z and a grid of size G.
// With G == 0, the grid size is chosen automatically (as in MFEM_FORALL_3D).
#define MFEM_FORALL_3D_GRID(i,N,X,Y,Z,G,...) \
   ForallWrap<3>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,Z,G)
// MFEM_FORALL that uses the basic CPU backend when use_dev is false. See for
// example the functions in vector.cpp, where the mfem device is deliberately
// not used for operations on small vectors.
#define MFEM_FORALL_SWITCH(use_dev,i,N,...) \
   ForallWrap<1>(use_dev,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__})
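// Example (a sketch): fall back to the sequential CPU loop for small
// problems; `use_dev` here is a hypothetical condition such as
// `N > threshold && Device::IsEnabled()`:
//
//    MFEM_FORALL_SWITCH(use_dev, i, N, d_y[i] = d_x[i];);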
/// OpenMP backend
template <typename HBODY>
void OmpWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_OPENMP
   #pragma omp parallel for
   for (int k = 0; k < N; k++)
   {
      h_body(k);
   }
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("OpenMP requested for MFEM but OpenMP is not enabled!");
#endif
}
/// RAJA Cuda and Hip backends
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
using cuda_launch_policy =
   RAJA::LaunchPolicy<RAJA::cuda_launch_t<true>>;
using cuda_teams_x =
   RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
using cuda_threads_z =
   RAJA::LoopPolicy<RAJA::cuda_thread_z_direct>;
#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
using hip_launch_policy =
   RAJA::LaunchPolicy<RAJA::hip_launch_t<true>>;
using hip_teams_x =
   RAJA::LoopPolicy<RAJA::hip_block_x_direct>;
using hip_threads_z =
   RAJA::LoopPolicy<RAJA::hip_thread_z_direct>;
#endif
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
template <const int BLOCKS = MFEM_CUDA_BLOCKS, typename DBODY>
void RajaCuWrap1D(const int N, DBODY &&d_body)
{
   // true denotes asynchronous kernel
   RAJA::forall<RAJA::cuda_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
}
template <typename DBODY>
void RajaCuWrap2D(const int N, DBODY &&d_body,
                  const int X, const int Y, const int BZ)
{
   MFEM_VERIFY(N>0, "");
   MFEM_VERIFY(BZ>0, "");
   const int G = (N+BZ-1)/BZ;   // number of teams: ceil(N / BZ)

   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<cuda_launch_policy>
   (LaunchParams(Teams(G), Threads(X, Y, BZ)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<cuda_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
      {
         loop<cuda_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
         {
            const int k = n*BZ + tz;
            if (k >= N) { return; }
            d_body(k);
         });
      });
   });

   MFEM_GPU_CHECK(cudaGetLastError());
}
template <typename DBODY>
void RajaCuWrap3D(const int N, DBODY &&d_body,
                  const int X, const int Y, const int Z, const int G)
{
   MFEM_VERIFY(N>0, "");
   const int GRID = G == 0 ? N : G;   // default: one team per iteration
   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<cuda_launch_policy>
   (LaunchParams(Teams(GRID), Threads(X, Y, Z)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<cuda_teams_x>(ctx, RangeSegment(0, N), d_body);
   });

   MFEM_GPU_CHECK(cudaGetLastError());
}
// Dispatch helper mapping the kernel dimension to the matching wrapper.
template <int Dim>
struct RajaCuWrap;

template <>
struct RajaCuWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaCuWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct RajaCuWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaCuWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct RajaCuWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaCuWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif // MFEM_USE_RAJA && RAJA_ENABLE_CUDA
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
template <const int BLOCKS = MFEM_HIP_BLOCKS, typename DBODY>
void RajaHipWrap1D(const int N, DBODY &&d_body)
{
   // true denotes asynchronous kernel
   RAJA::forall<RAJA::hip_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
}
template <typename DBODY>
void RajaHipWrap2D(const int N, DBODY &&d_body,
                   const int X, const int Y, const int BZ)
{
   MFEM_VERIFY(N>0, "");
   MFEM_VERIFY(BZ>0, "");
   const int G = (N+BZ-1)/BZ;   // number of teams: ceil(N / BZ)

   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<hip_launch_policy>
   (LaunchParams(Teams(G), Threads(X, Y, BZ)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<hip_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
      {
         loop<hip_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
         {
            const int k = n*BZ + tz;
            if (k >= N) { return; }
            d_body(k);
         });
      });
   });

   MFEM_GPU_CHECK(hipGetLastError());
}
template <typename DBODY>
void RajaHipWrap3D(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
{
   MFEM_VERIFY(N>0, "");
   const int GRID = G == 0 ? N : G;   // default: one team per iteration
   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<hip_launch_policy>
   (LaunchParams(Teams(GRID), Threads(X, Y, Z)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<hip_teams_x>(ctx, RangeSegment(0, N), d_body);
   });

   MFEM_GPU_CHECK(hipGetLastError());
}
// Dispatch helper mapping the kernel dimension to the matching wrapper.
template <int Dim>
struct RajaHipWrap;

template <>
struct RajaHipWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaHipWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct RajaHipWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaHipWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct RajaHipWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaHipWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif // MFEM_USE_RAJA && RAJA_ENABLE_HIP
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
/// RAJA OpenMP backend
template <typename HBODY>
void RajaOmpWrap(const int N, HBODY &&h_body)
{
   RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0,N), h_body);
}
#endif

/// RAJA sequential loop backend
template <typename HBODY>
void RajaSeqWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_RAJA
#if (RAJA_VERSION_MAJOR >= 2023)
   // RAJA::loop_exec is deprecated in RAJA 2023.x and newer
   using raja_forall_pol = RAJA::seq_exec;
#else
   using raja_forall_pol = RAJA::loop_exec;
#endif
   RAJA::forall<raja_forall_pol>(RAJA::RangeSegment(0,N), h_body);
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("RAJA requested but RAJA is not enabled!");
#endif
}
/// CUDA backend
#ifdef MFEM_USE_CUDA

template <typename BODY> __global__ static
void CuKernel1D(const int N, BODY body)
{
   const int k = blockDim.x*blockIdx.x + threadIdx.x;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void CuKernel2D(const int N, BODY body)
{
   const int k = blockIdx.x*blockDim.z + threadIdx.z;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void CuKernel3D(const int N, BODY body)
{
   // grid-stride loop: each block handles indices k = blockIdx.x,
   // blockIdx.x + gridDim.x, ...
   for (int k = blockIdx.x; k < N; k += gridDim.x) { body(k); }
}
template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
void CuWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;   // ceil(N / BLCK)
   CuKernel1D<<<GRID,BLCK>>>(N, d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}
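// Worked example: for N = 1000 and the default BLCK = 256, the ceiling
// division above gives GRID = (1000+255)/256 = 4 blocks, i.e. 1024 threads;
// the k >= N guard in CuKernel1D masks out the extra 24.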
template <typename DBODY>
void CuWrap2D(const int N, DBODY &&d_body,
              const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   MFEM_VERIFY(BZ>0, "");
   const int GRID = (N+BZ-1)/BZ;   // BZ iterations are batched per block
   const dim3 BLCK(X,Y,BZ);
   CuKernel2D<<<GRID,BLCK>>>(N,d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}
template <typename DBODY>
void CuWrap3D(const int N, DBODY &&d_body,
              const int X, const int Y, const int Z, const int G)
{
   if (N==0) { return; }
   const int GRID = G == 0 ? N : G;   // default: one block per iteration
   const dim3 BLCK(X,Y,Z);
   CuKernel3D<<<GRID,BLCK>>>(N,d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}
// Dispatch helper mapping the kernel dimension to the matching wrapper.
template <int Dim>
struct CuWrap;

template <>
struct CuWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      CuWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct CuWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      CuWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct CuWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      CuWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif // MFEM_USE_CUDA
/// HIP backend
#ifdef MFEM_USE_HIP

template <typename BODY> __global__ static
void HipKernel1D(const int N, BODY body)
{
   const int k = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void HipKernel2D(const int N, BODY body)
{
   const int k = hipBlockIdx_x*hipBlockDim_z + hipThreadIdx_z;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void HipKernel3D(const int N, BODY body)
{
   // grid-stride loop over the iteration indices
   for (int k = hipBlockIdx_x; k < N; k += hipGridDim_x) { body(k); }
}
template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
void HipWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;   // ceil(N / BLCK)
   hipLaunchKernelGGL(HipKernel1D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}
template <typename DBODY>
void HipWrap2D(const int N, DBODY &&d_body,
               const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   const int GRID = (N+BZ-1)/BZ;   // BZ iterations are batched per block
   const dim3 BLCK(X,Y,BZ);
   hipLaunchKernelGGL(HipKernel2D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}
template <typename DBODY>
void HipWrap3D(const int N, DBODY &&d_body,
               const int X, const int Y, const int Z, const int G)
{
   if (N==0) { return; }
   const int GRID = G == 0 ? N : G;   // default: one block per iteration
   const dim3 BLCK(X,Y,Z);
   hipLaunchKernelGGL(HipKernel3D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}
// Dispatch helper mapping the kernel dimension to the matching wrapper.
template <int Dim>
struct HipWrap;

template <>
struct HipWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      HipWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct HipWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      HipWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct HipWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      HipWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif // MFEM_USE_HIP
/// The forall kernel body wrapper
template <const int DIM, typename d_lambda, typename h_lambda>
inline void ForallWrap(const bool use_dev, const int N,
                       d_lambda &&d_body, h_lambda &&h_body,
                       const int X=0, const int Y=0, const int Z=0,
                       const int G=0)
{
   MFEM_CONTRACT_VAR(X);
   MFEM_CONTRACT_VAR(Y);
   MFEM_CONTRACT_VAR(Z);
   MFEM_CONTRACT_VAR(G);
   MFEM_CONTRACT_VAR(d_body);
   if (!use_dev) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
   // If Backend::RAJA_CUDA is allowed, use it
   if (Device::Allows(Backend::RAJA_CUDA))
   {
      return RajaCuWrap<DIM>::run(N, d_body, X, Y, Z, G);
   }
#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
   // If Backend::RAJA_HIP is allowed, use it
   if (Device::Allows(Backend::RAJA_HIP))
   {
      return RajaHipWrap<DIM>::run(N, d_body, X, Y, Z, G);
   }
#endif

#ifdef MFEM_USE_CUDA
   // If Backend::CUDA is allowed, use it
   if (Device::Allows(Backend::CUDA))
   {
      return CuWrap<DIM>::run(N, d_body, X, Y, Z, G);
   }
#endif

#ifdef MFEM_USE_HIP
   // If Backend::HIP is allowed, use it
   if (Device::Allows(Backend::HIP))
   {
      return HipWrap<DIM>::run(N, d_body, X, Y, Z, G);
   }
#endif

   // If Backend::DEBUG_DEVICE is allowed, use it
   if (Device::Allows(Backend::DEBUG_DEVICE)) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
   // If Backend::RAJA_OMP is allowed, use it
   if (Device::Allows(Backend::RAJA_OMP)) { return RajaOmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_OPENMP
   // If Backend::OMP is allowed, use it
   if (Device::Allows(Backend::OMP)) { return OmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_RAJA
   // If Backend::RAJA_CPU is allowed, use it
   if (Device::Allows(Backend::RAJA_CPU)) { return RajaSeqWrap(N, h_body); }
#endif

backend_cpu:
   // Handle Backend::CPU. This is also a fallback for any allowed backends
   // not handled above.
   for (int k = 0; k < N; k++) { h_body(k); }
}
template <const int DIM, typename lambda>
inline void ForallWrap(const bool use_dev, const int N, lambda &&body,
                       const int X=0, const int Y=0, const int Z=0,
                       const int G=0)
{
   ForallWrap<DIM>(use_dev, N, body, body, X, Y, Z, G);
}
template<typename lambda>
inline void forall(int N, lambda &&body) { ForallWrap<1>(true, N, body); }

template<typename lambda>
inline void forall_switch(bool use_dev, int N, lambda &&body)
{
   ForallWrap<1>(use_dev, N, body);
}

template<typename lambda>
inline void forall_2D(int N, int X, int Y, lambda &&body)
{
   ForallWrap<2>(true, N, body, X, Y, 1);
}

template<typename lambda>
inline void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
{
   ForallWrap<2>(true, N, body, X, Y, BZ);
}

template<typename lambda>
inline void forall_3D(int N, int X, int Y, int Z, lambda &&body)
{
   ForallWrap<3>(true, N, body, X, Y, Z, 0);
}

template<typename lambda>
inline void forall_3D_grid(int N, int X, int Y, int Z, int G, lambda &&body)
{
   ForallWrap<3>(true, N, body, X, Y, Z, G);
}
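// Example (a sketch): the lambda-based forall functions are the function
// equivalents of the MFEM_FORALL* macros. A 1D loop, where `d_x`, `d_y`,
// and `a` are hypothetical device-visible arrays/scalar:
//
//    forall(N, [=] MFEM_HOST_DEVICE (int i) { d_y[i] += a * d_x[i]; });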
/// Hypre-style forall executed on the host (optionally with OpenMP).
template<typename lambda>
inline void hypre_forall_cpu(int N, lambda &&body)
{
#ifdef HYPRE_USING_OPENMP
   #pragma omp parallel for HYPRE_SMP_SCHEDULE
#endif
   for (int i = 0; i < N; i++) { body(i); }
}
/// Hypre-style forall executed on the device matching HYPRE's GPU backend.
#if defined(HYPRE_USING_GPU)
template<typename lambda>
inline void hypre_forall_gpu(int N, lambda &&body)
{
#if defined(HYPRE_USING_CUDA)
   CuWrap1D(N, body);
#elif defined(HYPRE_USING_HIP)
   HipWrap1D(N, body);
#else
#error Unknown HYPRE GPU backend!
#endif
}
#endif // HYPRE_USING_GPU
/// Choose between hypre_forall_cpu and hypre_forall_gpu based on HYPRE's
/// build-time configuration and, for HYPRE >= 2.31.0, its runtime memory
/// location (see HypreUsingGPU()).
template<typename lambda>
inline void hypre_forall(int N, lambda &&body)
{
#if !defined(HYPRE_USING_GPU)
   hypre_forall_cpu(N, body);
#elif MFEM_HYPRE_VERSION < 23100
   hypre_forall_gpu(N, body);
#else
   // HYPRE >= 2.31.0: the memory location is a runtime setting
   if (HypreUsingGPU()) { hypre_forall_gpu(N, body); }
   else { hypre_forall_cpu(N, body); }
#endif
}
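// Example (a sketch): iterate over data owned by HYPRE, dispatching to the
// host or device to match where HYPRE keeps its memory; `d_data` is a
// hypothetical pointer obtained from a hypre object accessor:
//
//    hypre_forall(N, [=] MFEM_HOST_DEVICE (int i) { d_data[i] = 0.0; });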
} // namespace mfem

#endif // MFEM_FORALL_HPP