#ifndef MFEM_FORALL_HPP
#define MFEM_FORALL_HPP

#include "../config/config.hpp"
#include "error.hpp"
#include "backends.hpp"
#include "device.hpp"
#include "../linalg/dtensor.hpp"
#ifdef MFEM_USE_RAJA
#include "RAJA/RAJA.hpp"
#endif

// Default number of threads per block for the 1D CUDA/HIP kernel launches.
#define MFEM_CUDA_BLOCKS 256
#define MFEM_HIP_BLOCKS 256

namespace mfem
{

namespace internal
{

// Per-architecture limits on the number of 1D DOFs and quadrature points used
// by the tensor-product kernels.
struct DofQuadLimits_CUDA
{
   static constexpr int MAX_D1D = 14;
   static constexpr int MAX_Q1D = 14;
   static constexpr int HCURL_MAX_D1D = 5;
   static constexpr int HCURL_MAX_Q1D = 6;
   static constexpr int HDIV_MAX_D1D = 5;
   static constexpr int HDIV_MAX_Q1D = 6;
   static constexpr int MAX_INTERP_1D = 8;
   static constexpr int MAX_DET_1D = 6;
};
struct DofQuadLimits_HIP
{
   static constexpr int MAX_D1D = 10;
   static constexpr int MAX_Q1D = 10;
   static constexpr int HCURL_MAX_D1D = 5;
   static constexpr int HCURL_MAX_Q1D = 5;
   static constexpr int HDIV_MAX_D1D = 5;
   static constexpr int HDIV_MAX_Q1D = 6;
   static constexpr int MAX_INTERP_1D = 8;
   static constexpr int MAX_DET_1D = 6;
};
struct DofQuadLimits_CPU
{
#ifndef _WIN32
   static constexpr int MAX_D1D = 24;
   static constexpr int MAX_Q1D = 24;
#else
   static constexpr int MAX_D1D = 14;
   static constexpr int MAX_Q1D = 14;
#endif
   static constexpr int HCURL_MAX_D1D = 10;
   static constexpr int HCURL_MAX_Q1D = 10;
   static constexpr int HDIV_MAX_D1D = 10;
   static constexpr int HDIV_MAX_Q1D = 10;
   static constexpr int MAX_INTERP_1D = MAX_D1D;
   static constexpr int MAX_DET_1D = MAX_D1D;
};

} // namespace internal
/// Maximum number of 1D DOFs or quadrature points for the architecture
/// currently being compiled for.
#if defined(__CUDA_ARCH__)
using DofQuadLimits = internal::DofQuadLimits_CUDA;
#elif defined(__HIP_DEVICE_COMPILE__)
using DofQuadLimits = internal::DofQuadLimits_HIP;
#else
using DofQuadLimits = internal::DofQuadLimits_CPU;
#endif

/// Maximum number of 1D DOFs or quadrature points for the current runtime
/// configuration of the Device.
struct DeviceDofQuadLimits
{
   int MAX_D1D;       ///< Maximum number of 1D nodal points.
   int MAX_Q1D;       ///< Maximum number of 1D quadrature points.
   int HCURL_MAX_D1D; ///< Maximum number of 1D nodal points for H(curl).
   int HCURL_MAX_Q1D; ///< Maximum number of 1D quadrature points for H(curl).
   int HDIV_MAX_D1D;  ///< Maximum number of 1D nodal points for H(div).
   int HDIV_MAX_Q1D;  ///< Maximum number of 1D quadrature points for H(div).
   int MAX_INTERP_1D; ///< Maximum number of points for use in QuadratureInterpolator.
   int MAX_DET_1D;    ///< Maximum number of points for determinant computation in QuadratureInterpolator.

   /// Return a const reference to the DeviceDofQuadLimits singleton.
   static const DeviceDofQuadLimits &Get()
   {
      static const DeviceDofQuadLimits dof_quad_limits;
      return dof_quad_limits;
   }

private:
   DeviceDofQuadLimits()
   {
      if (Device::Allows(Backend::CUDA_MASK)) { Populate<internal::DofQuadLimits_CUDA>(); }
      else if (Device::Allows(Backend::HIP_MASK)) { Populate<internal::DofQuadLimits_HIP>(); }
      else { Populate<internal::DofQuadLimits_CPU>(); }
   }

   template <typename T> void Populate()
   {
      MAX_D1D = T::MAX_D1D;
      MAX_Q1D = T::MAX_Q1D;
      HCURL_MAX_D1D = T::HCURL_MAX_D1D;
      HCURL_MAX_Q1D = T::HCURL_MAX_Q1D;
      HDIV_MAX_D1D = T::HDIV_MAX_D1D;
      HDIV_MAX_Q1D = T::HDIV_MAX_Q1D;
      MAX_INTERP_1D = T::MAX_INTERP_1D;
      MAX_DET_1D = T::MAX_DET_1D;
   }
};
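// Usage sketch (illustrative, not part of the original header): DofQuadLimits
// gives compile-time bounds for the architecture being compiled, while
// DeviceDofQuadLimits::Get() gives the bounds of the runtime-selected backend.
// The names 'order' and 'd1d' below are hypothetical.
//
//    constexpr int max_d1d = DofQuadLimits::MAX_D1D; // compile-time bound
//    const int d1d = order + 1;
//    MFEM_VERIFY(d1d <= DeviceDofQuadLimits::Get().MAX_D1D,
//                "polynomial order too high for the configured device");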
// MFEM pragma macro that can be used inside MFEM_FORALL macros.
#define MFEM_PRAGMA(X) _Pragma(#X)

// MFEM_UNROLL pragma macro that can be used inside MFEM_FORALL macros.
#if defined(MFEM_USE_CUDA) && defined(__CUDA_ARCH__)
#define MFEM_UNROLL(N) MFEM_PRAGMA(unroll(N))
#else
#define MFEM_UNROLL(N)
#endif

// MFEM_GPU_FORALL: "parallel for" executed with CUDA or HIP based on the MFEM
// build flags MFEM_USE_CUDA or MFEM_USE_HIP.
#if defined(MFEM_USE_CUDA)
#define MFEM_GPU_FORALL(i, N,...) CuWrap1D(N, [=] MFEM_DEVICE \
                                           (int i) {__VA_ARGS__})
#elif defined(MFEM_USE_HIP)
#define MFEM_GPU_FORALL(i, N,...) HipWrap1D(N, [=] MFEM_DEVICE \
                                            (int i) {__VA_ARGS__})
#else
#define MFEM_GPU_FORALL(i, N,...) do { } while (false)
#endif

// The MFEM_FORALL wrapper.
#define MFEM_FORALL(i,N,...) \
   ForallWrap<1>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__})

// MFEM_FORALL with a 2D block of threads.
#define MFEM_FORALL_2D(i,N,X,Y,BZ,...) \
   ForallWrap<2>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,BZ)

// MFEM_FORALL with a 3D block of threads.
#define MFEM_FORALL_3D(i,N,X,Y,Z,...) \
   ForallWrap<3>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,Z)

// MFEM_FORALL with a 3D block of threads and grid size G.
#define MFEM_FORALL_3D_GRID(i,N,X,Y,Z,G,...) \
   ForallWrap<3>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,Z,G)

// MFEM_FORALL that only uses the device when use_dev is true.
#define MFEM_FORALL_SWITCH(use_dev,i,N,...) \
   ForallWrap<1>(use_dev,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__})

/// OpenMP backend.
template <typename HBODY>
void OmpWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_OPENMP
   #pragma omp parallel for
   for (int k = 0; k < N; k++)
   {
      h_body(k);
   }
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("OpenMP requested for MFEM but OpenMP is not enabled!");
#endif
}
/// RAJA CUDA and HIP launch/loop policies.
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
using cuda_launch_policy =
   RAJA::LaunchPolicy<RAJA::cuda_launch_t<true>>;
using cuda_teams_x =
   RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
using cuda_threads_z =
   RAJA::LoopPolicy<RAJA::cuda_thread_z_direct>;
#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
using hip_launch_policy =
   RAJA::LaunchPolicy<RAJA::hip_launch_t<true>>;
using hip_teams_x =
   RAJA::LoopPolicy<RAJA::hip_block_x_direct>;
using hip_threads_z =
   RAJA::LoopPolicy<RAJA::hip_thread_z_direct>;
#endif
/// RAJA CUDA backend.
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)

template <const int BLOCKS = MFEM_CUDA_BLOCKS, typename DBODY>
void RajaCuWrap1D(const int N, DBODY &&d_body)
{
   // 'true' denotes an asynchronous kernel launch
   RAJA::forall<RAJA::cuda_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
}
template <typename DBODY>
void RajaCuWrap2D(const int N, DBODY &&d_body,
                  const int X, const int Y, const int BZ)
{
   MFEM_VERIFY(N>0, "");
   MFEM_VERIFY(BZ>0, "");
   const int G = (N+BZ-1)/BZ;

   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<cuda_launch_policy>
   (LaunchParams(Teams(G), Threads(X, Y, BZ)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<cuda_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
      {
         loop<cuda_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
         {
            const int k = n*BZ + tz;
            if (k >= N) { return; }
            d_body(k);
         });
      });
   });

   MFEM_GPU_CHECK(cudaGetLastError());
}
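// Note on the 2D batching above (added for clarity): each of the G teams
// processes BZ consecutive iterations along the z thread dimension. For
// example, with N = 102 and BZ = 4 this gives G = (102+3)/4 = 26 teams; team
// n = 25 maps tz = 0..3 to k = 100..103, and the bounds check skips k >= N.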
template <typename DBODY>
void RajaCuWrap3D(const int N, DBODY &&d_body,
                  const int X, const int Y, const int Z, const int G)
{
   MFEM_VERIFY(N>0, "");
   const int GRID = G == 0 ? N : G;
   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<cuda_launch_policy>
   (LaunchParams(Teams(GRID), Threads(X, Y, Z)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<cuda_teams_x>(ctx, RangeSegment(0, N), d_body);
   });

   MFEM_GPU_CHECK(cudaGetLastError());
}
template <int Dim>
struct RajaCuWrap;

template <>
struct RajaCuWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaCuWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct RajaCuWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaCuWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct RajaCuWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaCuWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif // MFEM_USE_RAJA && RAJA_ENABLE_CUDA
/// RAJA HIP backend.
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)

template <const int BLOCKS = MFEM_HIP_BLOCKS, typename DBODY>
void RajaHipWrap1D(const int N, DBODY &&d_body)
{
   // 'true' denotes an asynchronous kernel launch
   RAJA::forall<RAJA::hip_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
}
template <typename DBODY>
void RajaHipWrap2D(const int N, DBODY &&d_body,
                   const int X, const int Y, const int BZ)
{
   MFEM_VERIFY(N>0, "");
   MFEM_VERIFY(BZ>0, "");
   const int G = (N+BZ-1)/BZ;

   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<hip_launch_policy>
   (LaunchParams(Teams(G), Threads(X, Y, BZ)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<hip_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
      {
         loop<hip_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
         {
            const int k = n*BZ + tz;
            if (k >= N) { return; }
            d_body(k);
         });
      });
   });

   MFEM_GPU_CHECK(hipGetLastError());
}
template <typename DBODY>
void RajaHipWrap3D(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
{
   MFEM_VERIFY(N>0, "");
   const int GRID = G == 0 ? N : G;
   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<hip_launch_policy>
   (LaunchParams(Teams(GRID), Threads(X, Y, Z)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<hip_teams_x>(ctx, RangeSegment(0, N), d_body);
   });

   MFEM_GPU_CHECK(hipGetLastError());
}
template <int Dim>
struct RajaHipWrap;

template <>
struct RajaHipWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaHipWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct RajaHipWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaHipWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct RajaHipWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaHipWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif // MFEM_USE_RAJA && RAJA_ENABLE_HIP
/// RAJA OpenMP backend.
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)

template <typename HBODY>
void RajaOmpWrap(const int N, HBODY &&h_body)
{
   RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0,N), h_body);
}

#endif // MFEM_USE_RAJA && RAJA_ENABLE_OPENMP
/// RAJA sequential loop backend.
template <typename HBODY>
void RajaSeqWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_RAJA
   RAJA::forall<RAJA::loop_exec>(RAJA::RangeSegment(0,N), h_body);
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("RAJA requested but RAJA is not enabled!");
#endif
}
/// CUDA backend.
#ifdef MFEM_USE_CUDA

template <typename BODY> __global__ static
void CuKernel1D(const int N, BODY body)
{
   const int k = blockDim.x*blockIdx.x + threadIdx.x;
   if (k >= N) { return; }
   body(k);
}
template <typename BODY> __global__ static
void CuKernel2D(const int N, BODY body)
{
   const int k = blockIdx.x*blockDim.z + threadIdx.z;
   if (k >= N) { return; }
   body(k);
}
template <typename BODY> __global__ static
void CuKernel3D(const int N, BODY body)
{
   for (int k = blockIdx.x; k < N; k += gridDim.x) { body(k); }
}
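// Note on CuKernel3D (added for clarity): it is a grid-stride loop over
// iterations, so a launch with GRID blocks covers any N. For example, with
// N = 5 and GRID = 2, block 0 handles k = 0, 2, 4 and block 1 handles
// k = 1, 3. Every thread of a block invokes body(k), so the body typically
// distributes its work over the X*Y*Z threads itself.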
template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
void CuWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;
   CuKernel1D<<<GRID,BLCK>>>(N, d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}
template <typename DBODY>
void CuWrap2D(const int N, DBODY &&d_body,
              const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   MFEM_VERIFY(BZ>0, "");
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   CuKernel2D<<<GRID,BLCK>>>(N,d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}
template <typename DBODY>
void CuWrap3D(const int N, DBODY &&d_body,
              const int X, const int Y, const int Z, const int G)
{
   if (N==0) { return; }
   const int GRID = G == 0 ? N : G;
   const dim3 BLCK(X,Y,Z);
   CuKernel3D<<<GRID,BLCK>>>(N,d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}
template <int Dim>
struct CuWrap;

template <>
struct CuWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      CuWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct CuWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      CuWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct CuWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      CuWrap3D(N, d_body, X, Y, Z, G);
   }
};
#endif // MFEM_USE_CUDA

/// HIP backend.
#ifdef MFEM_USE_HIP

template <typename BODY> __global__ static
void HipKernel1D(const int N, BODY body)
{
   const int k = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
   if (k >= N) { return; }
   body(k);
}
template <typename BODY> __global__ static
void HipKernel2D(const int N, BODY body)
{
   const int k = hipBlockIdx_x*hipBlockDim_z + hipThreadIdx_z;
   if (k >= N) { return; }
   body(k);
}
template <typename BODY> __global__ static
void HipKernel3D(const int N, BODY body)
{
   for (int k = hipBlockIdx_x; k < N; k += hipGridDim_x) { body(k); }
}
template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
void HipWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;
   hipLaunchKernelGGL(HipKernel1D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}
template <typename DBODY>
void HipWrap2D(const int N, DBODY &&d_body,
               const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   hipLaunchKernelGGL(HipKernel2D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}
template <typename DBODY>
void HipWrap3D(const int N, DBODY &&d_body,
               const int X, const int Y, const int Z, const int G)
{
   if (N==0) { return; }
   const int GRID = G == 0 ? N : G;
   const dim3 BLCK(X,Y,Z);
   hipLaunchKernelGGL(HipKernel3D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}
template <int Dim>
struct HipWrap;

template <>
struct HipWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      HipWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct HipWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      HipWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct HipWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      HipWrap3D(N, d_body, X, Y, Z, G);
   }
};
#endif // MFEM_USE_HIP

/// The forall kernel body wrapper.
template <const int DIM, typename d_lambda, typename h_lambda>
inline void ForallWrap(const bool use_dev, const int N,
                       d_lambda &&d_body, h_lambda &&h_body,
                       const int X=0, const int Y=0, const int Z=0,
                       const int G=0)
{
   MFEM_CONTRACT_VAR(X);
   MFEM_CONTRACT_VAR(Y);
   MFEM_CONTRACT_VAR(Z);
   MFEM_CONTRACT_VAR(G);
   MFEM_CONTRACT_VAR(d_body);
   if (!use_dev) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
   // If Backend::RAJA_CUDA is allowed, use it
   if (Device::Allows(Backend::RAJA_CUDA))
   {
      return RajaCuWrap<DIM>::run(N, d_body, X, Y, Z, G);
   }
#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
   // If Backend::RAJA_HIP is allowed, use it
   if (Device::Allows(Backend::RAJA_HIP))
   {
      return RajaHipWrap<DIM>::run(N, d_body, X, Y, Z, G);
   }
#endif

#ifdef MFEM_USE_CUDA
   // If Backend::CUDA is allowed, use it
   if (Device::Allows(Backend::CUDA))
   {
      return CuWrap<DIM>::run(N, d_body, X, Y, Z, G);
   }
#endif

#ifdef MFEM_USE_HIP
   // If Backend::HIP is allowed, use it
   if (Device::Allows(Backend::HIP))
   {
      return HipWrap<DIM>::run(N, d_body, X, Y, Z, G);
   }
#endif

   // If Backend::DEBUG_DEVICE is allowed, use the host fallback
   if (Device::Allows(Backend::DEBUG_DEVICE)) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
   // If Backend::RAJA_OMP is allowed, use it
   if (Device::Allows(Backend::RAJA_OMP)) { return RajaOmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_OPENMP
   // If Backend::OMP is allowed, use it
   if (Device::Allows(Backend::OMP)) { return OmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_RAJA
   // If Backend::RAJA_CPU is allowed, use it
   if (Device::Allows(Backend::RAJA_CPU)) { return RajaSeqWrap(N, h_body); }
#endif

backend_cpu:
   // Handle Backend::CPU; also the fallback for any allowed backend not
   // handled above.
   for (int k = 0; k < N; k++) { h_body(k); }
}
template <const int DIM, typename lambda>
inline void ForallWrap(const bool use_dev, const int N, lambda &&body,
                       const int X=0, const int Y=0, const int Z=0,
                       const int G=0)
{
   ForallWrap<DIM>(use_dev, N, body, body, X, Y, Z, G);
}
template<typename lambda>
inline void forall(int N, lambda &&body) { ForallWrap<1>(true, N, body); }
template<typename lambda>
inline void forall_switch(bool use_dev, int N, lambda &&body)
{
   ForallWrap<1>(use_dev, N, body);
}
template<typename lambda>
inline void forall_2D(int N, int X, int Y, lambda &&body)
{
   ForallWrap<2>(true, N, body, X, Y, 1);
}
template<typename lambda>
inline void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
{
   ForallWrap<2>(true, N, body, X, Y, BZ);
}
template<typename lambda>
inline void forall_3D(int N, int X, int Y, int Z, lambda &&body)
{
   ForallWrap<3>(true, N, body, X, Y, Z, 0);
}
template<typename lambda>
inline void forall_3D_grid(int N, int X, int Y, int Z, int G, lambda &&body)
{
   ForallWrap<3>(true, N, body, X, Y, Z, G);
}
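// Usage sketch (illustrative, not part of the original header): the forall_*
// helpers are thin wrappers around ForallWrap. For a 3D kernel on the CUDA or
// HIP backend, X, Y, Z become the thread-block dimensions and N the number of
// blocks (one per iteration by default), so a per-element kernel over 'NE'
// elements with a Q x Q x Q thread block could look like this (the names 'NE',
// 'Q', and 'qdata' are hypothetical):
//
//    forall_3D(NE, Q, Q, Q, [=] MFEM_HOST_DEVICE (int e)
//    {
//       // cooperative work of the Q*Q*Q threads for element e, e.g. using
//       // MFEM_FOREACH_THREAD and qdata(e, ...)
//    });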
} // namespace mfem

#endif // MFEM_FORALL_HPP