#ifndef MFEM_FORALL_HPP
#define MFEM_FORALL_HPP

#include "../config/config.hpp"
#include "error.hpp"
#include "backends.hpp"
#include "device.hpp"
#include "../linalg/dtensor.hpp"

namespace mfem
{
// MFEM pragma macros that can be used inside MFEM_FORALL macros.
#define MFEM_PRAGMA(X) _Pragma(#X)

// MFEM_UNROLL pragma macro that can be used inside MFEM_FORALL macros.
#if defined(MFEM_USE_CUDA) && defined(__CUDA_ARCH__)
#define MFEM_UNROLL(N) MFEM_PRAGMA(unroll(N))
#else
#define MFEM_UNROLL(N)
#endif
// MFEM_GPU_FORALL: "parallel for" executed with CUDA or HIP based on the MFEM
// build-time configuration (MFEM_USE_CUDA or MFEM_USE_HIP). If neither CUDA
// nor HIP is enabled, this macro is a no-op.
#if defined(MFEM_USE_CUDA)
#define MFEM_GPU_FORALL(i, N,...) CuWrap1D(N, [=] MFEM_DEVICE      \
                                           (int i) {__VA_ARGS__})
#elif defined(MFEM_USE_HIP)
#define MFEM_GPU_FORALL(i, N,...) HipWrap1D(N, [=] MFEM_DEVICE     \
                                            (int i) {__VA_ARGS__})
#else
#define MFEM_GPU_FORALL(i, N,...) do { } while (false)
#endif
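// Illustrative usage sketch (not part of the original header): when CUDA or
// HIP is enabled at build time, MFEM_GPU_FORALL always launches on the GPU,
// independent of the runtime Device configuration, e.g.
//
//    // MFEM_GPU_FORALL(i, N, d_data[i] = 0.0;);
//
// where d_data is a hypothetical device pointer. In a build with neither CUDA
// nor HIP, the macro expands to an empty statement and the body is skipped.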
// The MFEM_FORALL wrapper
#define MFEM_FORALL(i,N,...)                             \
   ForallWrap<1>(true,N,                                 \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__},  \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__})
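// Illustrative usage sketch (not part of the original header): the same body
// is compiled both as a device lambda and as a host lambda, and ForallWrap<1>
// dispatches to whichever backend the current mfem::Device allows. The names
// d_x, d_y and alpha below are hypothetical placeholders for data already
// available on the chosen backend:
//
//    // MFEM_FORALL(i, N, d_y[i] += alpha * d_x[i];);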
// MFEM_FORALL with a 2D CUDA block
#define MFEM_FORALL_2D(i,N,X,Y,BZ,...)                   \
   ForallWrap<2>(true,N,                                 \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__},  \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__},  \
                 X,Y,BZ)
// MFEM_FORALL with a 3D CUDA block
#define MFEM_FORALL_3D(i,N,X,Y,Z,...)                    \
   ForallWrap<3>(true,N,                                 \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__},  \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__},  \
                 X,Y,Z)
// MFEM_FORALL with a 3D CUDA block and grid
// With G=0, this is the same as MFEM_FORALL_3D(i,N,X,Y,Z,...)
#define MFEM_FORALL_3D_GRID(i,N,X,Y,Z,G,...)             \
   ForallWrap<3>(true,N,                                 \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__},  \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__},  \
                 X,Y,Z,G)
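// Illustrative usage sketch (assumption, not from this header): the 2D/3D
// variants launch one block of X*Y*Z (or X*Y*BZ) threads per index i, so
// kernel bodies are typically written with MFEM's per-thread helpers. NE,
// Q1D and QData below are hypothetical, and MFEM_FOREACH_THREAD is assumed
// to be provided by MFEM's kernel utilities:
//
//    // MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D,
//    // {
//    //    MFEM_FOREACH_THREAD(qz,z,Q1D)
//    //    {
//    //       MFEM_FOREACH_THREAD(qy,y,Q1D)
//    //       {
//    //          MFEM_FOREACH_THREAD(qx,x,Q1D)
//    //          {
//    //             QData(qx,qy,qz,e) *= 2.0;
//    //          }
//    //       }
//    //    }
//    // });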
// MFEM_FORALL that uses the basic CPU backend when use_dev is false.
#define MFEM_FORALL_SWITCH(use_dev,i,N,...)              \
   ForallWrap<1>(use_dev,N,                              \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__},  \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__})
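// Illustrative usage sketch (assumption): MFEM_FORALL_SWITCH keeps a single
// kernel body but lets the caller force the plain CPU loop, e.g. for small
// vectors where a device launch is not worthwhile:
//
//    // const bool use_dev = x.UseDevice() && (N > 1000);
//    // MFEM_FORALL_SWITCH(use_dev, i, N, d_y[i] = d_x[i];);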
/// OpenMP backend
template <typename HBODY>
void OmpWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_OPENMP
   #pragma omp parallel for
   for (int k = 0; k < N; k++)
   {
      h_body(k);
   }
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("OpenMP requested for MFEM but OpenMP is not enabled!");
#endif
}
/// RAJA Cuda and Hip backends
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
using cuda_launch_policy =
   RAJA::expt::LaunchPolicy<RAJA::expt::null_launch_t, RAJA::expt::cuda_launch_t<true>>;
using cuda_teams_x =
   RAJA::expt::LoopPolicy<RAJA::loop_exec, RAJA::cuda_block_x_direct>;
using cuda_threads_z =
   RAJA::expt::LoopPolicy<RAJA::loop_exec, RAJA::cuda_thread_z_direct>;
#endif
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
using hip_launch_policy =
   RAJA::expt::LaunchPolicy<RAJA::expt::null_launch_t, RAJA::expt::hip_launch_t<true>>;
using hip_teams_x =
   RAJA::expt::LoopPolicy<RAJA::loop_exec, RAJA::hip_block_x_direct>;
using hip_threads_z =
   RAJA::expt::LoopPolicy<RAJA::loop_exec, RAJA::hip_thread_z_direct>;
#endif
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
template <const int BLOCKS = MFEM_CUDA_BLOCKS, typename DBODY>
void RajaCuWrap1D(const int N, DBODY &&d_body)
{
   // true denotes an asynchronous kernel launch
   RAJA::forall<RAJA::cuda_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
}
template <typename DBODY>
void RajaCuWrap2D(const int N, DBODY &&d_body,
                  const int X, const int Y, const int BZ)
{
   MFEM_VERIFY(N>0, "");
   MFEM_VERIFY(BZ>0, "");
   const int G = (N+BZ-1)/BZ;

   using namespace RAJA::expt;
   using RAJA::RangeSegment;

   launch<cuda_launch_policy>
   (DEVICE, Grid(Teams(G), Threads(X, Y, BZ)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<cuda_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
      {
         loop<cuda_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
         {
            const int k = n*BZ + tz;
            if (k >= N) { return; }
            d_body(k);
         });
      });
   });

   MFEM_GPU_CHECK(cudaGetLastError());
}
template <typename DBODY>
void RajaCuWrap3D(const int N, DBODY &&d_body,
                  const int X, const int Y, const int Z, const int G)
{
   MFEM_VERIFY(N>0, "");
   const int GRID = G == 0 ? N : G;
   using namespace RAJA::expt;
   using RAJA::RangeSegment;

   launch<cuda_launch_policy>
   (DEVICE, Grid(Teams(GRID), Threads(X, Y, Z)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<cuda_teams_x>(ctx, RangeSegment(0, N), d_body);
   });

   MFEM_GPU_CHECK(cudaGetLastError());
}
// Dispatch helper that selects the 1D/2D/3D RAJA CUDA wrapper by dimension.
template <int Dim>
struct RajaCuWrap;

template <>
struct RajaCuWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaCuWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct RajaCuWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaCuWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct RajaCuWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaCuWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
template <const int BLOCKS = MFEM_HIP_BLOCKS, typename DBODY>
void RajaHipWrap1D(const int N, DBODY &&d_body)
{
   // true denotes an asynchronous kernel launch
   RAJA::forall<RAJA::hip_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
}
template <typename DBODY>
void RajaHipWrap2D(const int N, DBODY &&d_body,
                   const int X, const int Y, const int BZ)
{
   MFEM_VERIFY(N>0, "");
   MFEM_VERIFY(BZ>0, "");
   const int G = (N+BZ-1)/BZ;

   using namespace RAJA::expt;
   using RAJA::RangeSegment;

   launch<hip_launch_policy>
   (DEVICE, Grid(Teams(G), Threads(X, Y, BZ)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<hip_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
      {
         loop<hip_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
         {
            const int k = n*BZ + tz;
            if (k >= N) { return; }
            d_body(k);
         });
      });
   });

   MFEM_GPU_CHECK(hipGetLastError());
}
template <typename DBODY>
void RajaHipWrap3D(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
{
   MFEM_VERIFY(N>0, "");
   const int GRID = G == 0 ? N : G;
   using namespace RAJA::expt;
   using RAJA::RangeSegment;

   launch<hip_launch_policy>
   (DEVICE, Grid(Teams(GRID), Threads(X, Y, Z)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<hip_teams_x>(ctx, RangeSegment(0, N), d_body);
   });

   MFEM_GPU_CHECK(hipGetLastError());
}
// Dispatch helper that selects the 1D/2D/3D RAJA HIP wrapper by dimension.
template <int Dim>
struct RajaHipWrap;

template <>
struct RajaHipWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaHipWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct RajaHipWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaHipWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct RajaHipWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaHipWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
/// RAJA OpenMP backend
template <typename HBODY>
void RajaOmpWrap(const int N, HBODY &&h_body)
{
   RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0,N), h_body);
}
#endif
/// RAJA sequential loop backend
template <typename HBODY>
void RajaSeqWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_RAJA
   RAJA::forall<RAJA::loop_exec>(RAJA::RangeSegment(0,N), h_body);
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("RAJA requested but RAJA is not enabled!");
#endif
}
/// CUDA backend
#ifdef MFEM_USE_CUDA

template <typename BODY> __global__ static
void CuKernel1D(const int N, BODY body)
{
   const int k = blockDim.x*blockIdx.x + threadIdx.x;
   if (k >= N) { return; }
   body(k);
}
template <typename BODY> __global__ static
void CuKernel2D(const int N, BODY body)
{
   const int k = blockIdx.x*blockDim.z + threadIdx.z;
   if (k >= N) { return; }
   body(k);
}
template <typename BODY> __global__ static
void CuKernel3D(const int N, BODY body)
{
   // Grid-stride loop over the entries: each block visits k = blockIdx.x,
   // blockIdx.x + gridDim.x, ...
   for (int k = blockIdx.x; k < N; k += gridDim.x) { body(k); }
}
template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
void CuWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;
   CuKernel1D<<<GRID,BLCK>>>(N, d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}
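// Note (illustration, not from the original source): CuWrap1D launches
// ceil(N/BLCK) one-dimensional blocks. With the default block size
// (MFEM_CUDA_BLOCKS, typically 256) and N = 1000, the launch is <<<4,256>>>
// and the trailing 24 threads exit early through the k >= N guard in
// CuKernel1D.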
template <typename DBODY>
void CuWrap2D(const int N, DBODY &&d_body,
              const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   MFEM_VERIFY(BZ>0, "");
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   CuKernel2D<<<GRID,BLCK>>>(N,d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}
template <typename DBODY>
void CuWrap3D(const int N, DBODY &&d_body,
              const int X, const int Y, const int Z, const int G)
{
   if (N==0) { return; }
   const int GRID = G == 0 ? N : G;
   const dim3 BLCK(X,Y,Z);
   CuKernel3D<<<GRID,BLCK>>>(N,d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}
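// Note (illustration): with G == 0, CuWrap3D launches one block per entry
// (GRID = N) and CuKernel3D's grid-stride loop visits each k exactly once;
// passing G > 0 caps the grid size, so each block then strides over several
// k values. For example, a call such as CuWrap3D(N, body, Q, Q, Q, 64) would
// launch 64 blocks of Q*Q*Q threads regardless of N (names here are
// hypothetical).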
// Dispatch helper that selects the 1D/2D/3D CUDA wrapper by dimension.
template <int Dim>
struct CuWrap;

template <>
struct CuWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      CuWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct CuWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      CuWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct CuWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      CuWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif // MFEM_USE_CUDA
/// HIP backend
#ifdef MFEM_USE_HIP

template <typename BODY> __global__ static
void HipKernel1D(const int N, BODY body)
{
   const int k = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
   if (k >= N) { return; }
   body(k);
}
template <typename BODY> __global__ static
void HipKernel2D(const int N, BODY body)
{
   const int k = hipBlockIdx_x*hipBlockDim_z + hipThreadIdx_z;
   if (k >= N) { return; }
   body(k);
}
template <typename BODY> __global__ static
void HipKernel3D(const int N, BODY body)
{
   // Grid-stride loop over the entries, mirroring CuKernel3D.
   for (int k = hipBlockIdx_x; k < N; k += hipGridDim_x) { body(k); }
}
template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
void HipWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;
   hipLaunchKernelGGL(HipKernel1D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}
template <typename DBODY>
void HipWrap2D(const int N, DBODY &&d_body,
               const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   hipLaunchKernelGGL(HipKernel2D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}
template <typename DBODY>
void HipWrap3D(const int N, DBODY &&d_body,
               const int X, const int Y, const int Z, const int G)
{
   if (N==0) { return; }
   const int GRID = G == 0 ? N : G;
   const dim3 BLCK(X,Y,Z);
   hipLaunchKernelGGL(HipKernel3D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}
// Dispatch helper that selects the 1D/2D/3D HIP wrapper by dimension.
template <int Dim>
struct HipWrap;

template <>
struct HipWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      HipWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct HipWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      HipWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct HipWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      HipWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif // MFEM_USE_HIP
/// The forall kernel body wrapper
template <const int DIM, typename DBODY, typename HBODY>
inline void ForallWrap(const bool use_dev, const int N,
                       DBODY &&d_body, HBODY &&h_body,
                       const int X=0, const int Y=0, const int Z=0,
                       const int G=0)
{
   MFEM_CONTRACT_VAR(X);
   MFEM_CONTRACT_VAR(Y);
   MFEM_CONTRACT_VAR(Z);
   MFEM_CONTRACT_VAR(G);
   MFEM_CONTRACT_VAR(d_body);
   if (!use_dev) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
   // If Backend::RAJA_CUDA is allowed, use it
   if (Device::Allows(Backend::RAJA_CUDA))
   { return RajaCuWrap<DIM>::run(N, d_body, X, Y, Z, G); }
#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
   // If Backend::RAJA_HIP is allowed, use it
   if (Device::Allows(Backend::RAJA_HIP))
   { return RajaHipWrap<DIM>::run(N, d_body, X, Y, Z, G); }
#endif

#ifdef MFEM_USE_CUDA
   // If Backend::CUDA is allowed, use it
   if (Device::Allows(Backend::CUDA))
   { return CuWrap<DIM>::run(N, d_body, X, Y, Z, G); }
#endif

#ifdef MFEM_USE_HIP
   // If Backend::HIP is allowed, use it
   if (Device::Allows(Backend::HIP))
   { return HipWrap<DIM>::run(N, d_body, X, Y, Z, G); }
#endif

   // If Backend::DEBUG_DEVICE is allowed, use the CPU path below
   if (Device::Allows(Backend::DEBUG_DEVICE)) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
   // If Backend::RAJA_OMP is allowed, use it
   if (Device::Allows(Backend::RAJA_OMP)) { return RajaOmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_OPENMP
   // If Backend::OMP is allowed, use it
   if (Device::Allows(Backend::OMP)) { return OmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_RAJA
   // If Backend::RAJA_CPU is allowed, use it
   if (Device::Allows(Backend::RAJA_CPU)) { return RajaSeqWrap(N, h_body); }
#endif

backend_cpu:
   // Handle Backend::CPU and fall back for any allowed backend not handled
   // above: a plain sequential loop over the host body.
   for (int k = 0; k < N; k++) { h_body(k); }
}
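// Illustrative sketch (not part of the original header): the MFEM_FORALL*
// macros above are thin wrappers over this function, so an explicit call
// such as
//
//    // ForallWrap<2>(true, NE,
//    //               [=] MFEM_DEVICE (int e) { /* device body */ },
//    //               [&] MFEM_LAMBDA (int e) { /* host body */ },
//    //               Q1D, Q1D, 1);
//
// behaves like MFEM_FORALL_2D(e, NE, Q1D, Q1D, 1, ...), picking the first
// backend in the order listed above that Device::Allows. NE and Q1D are
// hypothetical names.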
} // namespace mfem

#endif // MFEM_FORALL_HPP