#ifndef MFEM_FORALL_HPP
#define MFEM_FORALL_HPP

#include "../config/config.hpp"
#include "error.hpp"
#include "backends.hpp"
#include "device.hpp"
#include "../linalg/dtensor.hpp"

// MFEM pragma macros that can be used inside MFEM_FORALL macros.
#define MFEM_PRAGMA(X) _Pragma(#X)

// MFEM_UNROLL pragma macro that can be used inside MFEM_FORALL macros.
#if defined(MFEM_USE_CUDA) && defined(__CUDA_ARCH__)
#define MFEM_UNROLL(N) MFEM_PRAGMA(unroll(N))
#else
#define MFEM_UNROLL(N)
#endif

// MFEM_GPU_FORALL: "parallel for" executed with CUDA or HIP based on the MFEM
// build-time configuration (MFEM_USE_CUDA or MFEM_USE_HIP). If neither is
// enabled, this macro is a no-op.
#if defined(MFEM_USE_CUDA)
#define MFEM_GPU_FORALL(i, N,...) CuWrap1D(N, [=] MFEM_DEVICE \
                                           (int i) {__VA_ARGS__})
#elif defined(MFEM_USE_HIP)
#define MFEM_GPU_FORALL(i, N,...) HipWrap1D(N, [=] MFEM_DEVICE \
                                            (int i) {__VA_ARGS__})
#else
#define MFEM_GPU_FORALL(i, N,...) do { } while (false)
#endif

// The MFEM_FORALL wrapper
#define MFEM_FORALL(i,N,...) \
   ForallWrap<1>(true,N, \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__})

// MFEM_FORALL with a 2D CUDA block
#define MFEM_FORALL_2D(i,N,X,Y,BZ,...) \
   ForallWrap<2>(true,N, \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__}, \
                 X,Y,BZ)

// MFEM_FORALL with a 3D CUDA block
#define MFEM_FORALL_3D(i,N,X,Y,Z,...) \
   ForallWrap<3>(true,N, \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__}, \
                 X,Y,Z)

// MFEM_FORALL with a 3D CUDA block and a grid of G blocks (G=0 behaves like
// MFEM_FORALL_3D)
#define MFEM_FORALL_3D_GRID(i,N,X,Y,Z,G,...) \
   ForallWrap<3>(true,N, \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__}, \
                 X,Y,Z,G)

// MFEM_FORALL that uses the basic CPU backend when use_dev is false
#define MFEM_FORALL_SWITCH(use_dev,i,N,...) \
   ForallWrap<1>(use_dev,N, \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__})


// [host] OpenMP backend. Enabled when MFEM_USE_OPENMP = YES.
template <typename HBODY>
void OmpWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_OPENMP
   #pragma omp parallel for
   for (int k = 0; k < N; k++)
   {
      h_body(k);
   }
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("OpenMP requested for MFEM but OpenMP is not enabled!");
#endif
}

// RAJA Cuda and Hip backends
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
using cuda_launch_policy =
   RAJA::LaunchPolicy<RAJA::cuda_launch_t<true>>;
using cuda_teams_x =
   RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
using cuda_threads_z =
   RAJA::LoopPolicy<RAJA::cuda_thread_z_direct>;
#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
using hip_launch_policy =
   RAJA::LaunchPolicy<RAJA::hip_launch_t<true>>;
using hip_teams_x =
   RAJA::LoopPolicy<RAJA::hip_block_x_direct>;
using hip_threads_z =
   RAJA::LoopPolicy<RAJA::hip_thread_z_direct>;
#endif

// [device] RAJA CUDA backend. Enabled when MFEM_USE_RAJA = YES and
// MFEM_USE_CUDA = YES.
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
template <const int BLOCKS = MFEM_CUDA_BLOCKS, typename DBODY>
void RajaCuWrap1D(const int N, DBODY &&d_body)
{
   // true denotes asynchronous kernel
   RAJA::forall<RAJA::cuda_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
}

template <typename DBODY>
void RajaCuWrap2D(const int N, DBODY &&d_body,
                  const int X, const int Y, const int BZ)
{
   MFEM_VERIFY(N>0, "");
   MFEM_VERIFY(BZ>0, "");
   const int G = (N+BZ-1)/BZ;

   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<cuda_launch_policy>
   (LaunchParams(Teams(G), Threads(X, Y, BZ)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<cuda_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
      {
         loop<cuda_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
         {
            const int k = n*BZ + tz;
            if (k >= N) { return; }
            d_body(k);
         });
      });
   });

   MFEM_GPU_CHECK(cudaGetLastError());
}
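
// Worked sizing sketch (illustrative numbers, not from the original source):
// with N = 100 and BZ = 4, G = (100+4-1)/4 = 25 teams are launched; team n
// and z-thread tz cooperate on element k = n*4 + tz, and the k >= N guard
// above discards the remainder when N is not a multiple of BZ.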

template <typename DBODY>
void RajaCuWrap3D(const int N, DBODY &&d_body,
                  const int X, const int Y, const int Z, const int G)
{
   MFEM_VERIFY(N>0, "");
   const int GRID = G == 0 ? N : G;
   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<cuda_launch_policy>
   (LaunchParams(Teams(GRID), Threads(X, Y, Z)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<cuda_teams_x>(ctx, RangeSegment(0, N), d_body);
   });

   MFEM_GPU_CHECK(cudaGetLastError());
}

template <int Dim>
struct RajaCuWrap;

template <>
struct RajaCuWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaCuWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct RajaCuWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaCuWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct RajaCuWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaCuWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif

// [device] RAJA HIP backend. Enabled when MFEM_USE_RAJA = YES and
// MFEM_USE_HIP = YES.
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
template <const int BLOCKS = MFEM_HIP_BLOCKS, typename DBODY>
void RajaHipWrap1D(const int N, DBODY &&d_body)
{
   // true denotes asynchronous kernel
   RAJA::forall<RAJA::hip_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
}

template <typename DBODY>
void RajaHipWrap2D(const int N, DBODY &&d_body,
                   const int X, const int Y, const int BZ)
{
   MFEM_VERIFY(N>0, "");
   MFEM_VERIFY(BZ>0, "");
   const int G = (N+BZ-1)/BZ;

   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<hip_launch_policy>
   (LaunchParams(Teams(G), Threads(X, Y, BZ)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<hip_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
      {
         loop<hip_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
         {
            const int k = n*BZ + tz;
            if (k >= N) { return; }
            d_body(k);
         });
      });
   });

   MFEM_GPU_CHECK(hipGetLastError());
}

template <typename DBODY>
void RajaHipWrap3D(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
{
   MFEM_VERIFY(N>0, "");
   const int GRID = G == 0 ? N : G;
   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<hip_launch_policy>
   (LaunchParams(Teams(GRID), Threads(X, Y, Z)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<hip_teams_x>(ctx, RangeSegment(0, N), d_body);
   });

   MFEM_GPU_CHECK(hipGetLastError());
}

template <int Dim>
struct RajaHipWrap;

template <>
struct RajaHipWrap<1>
{
   template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaHipWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct RajaHipWrap<2>
{
   template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaHipWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct RajaHipWrap<3>
{
   template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaHipWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif

// [host] RAJA OpenMP backend. Enabled when MFEM_USE_RAJA = YES and
// MFEM_USE_OPENMP = YES.
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
template <typename HBODY>
void RajaOmpWrap(const int N, HBODY &&h_body)
{
   RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0,N), h_body);
}
#endif

// [host] RAJA CPU backend: sequential execution on each MPI rank. Enabled
// when MFEM_USE_RAJA = YES.
template <typename HBODY>
void RajaSeqWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_RAJA
   RAJA::forall<RAJA::loop_exec>(RAJA::RangeSegment(0,N), h_body);
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("RAJA requested but RAJA is not enabled!");
#endif
}

// [device] CUDA backend. Enabled when MFEM_USE_CUDA = YES.
#ifdef MFEM_USE_CUDA

template <typename BODY> __global__ static
void CuKernel1D(const int N, BODY body)
{
   const int k = blockDim.x*blockIdx.x + threadIdx.x;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void CuKernel2D(const int N, BODY body)
{
   const int k = blockIdx.x*blockDim.z + threadIdx.z;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void CuKernel3D(const int N, BODY body)
{
   for (int k = blockIdx.x; k < N; k += gridDim.x) { body(k); }
}

template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
void CuWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;
   CuKernel1D<<<GRID,BLCK>>>(N, d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}
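
// Worked sizing sketch (illustrative numbers, not from the original source):
// with N = 1000 and the default BLCK = 256, GRID = (1000+255)/256 = 4 blocks
// (1024 threads) are launched; the k >= N guard in CuKernel1D masks off the
// 24 excess threads.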

template <typename DBODY>
void CuWrap2D(const int N, DBODY &&d_body,
              const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   MFEM_VERIFY(BZ>0, "");
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   CuKernel2D<<<GRID,BLCK>>>(N,d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}
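
// Illustrative usage sketch (hypothetical sizes, not from the original
// source): MFEM element kernels typically give each element a Q x Q thread
// tile and batch BZ elements per block along z, e.g.
//
//    MFEM_FORALL_2D(e, NE, Q, Q, 4,
//    {
//       MFEM_FOREACH_THREAD(qy,y,Q)
//       {
//          MFEM_FOREACH_THREAD(qx,x,Q) { /* work on quad point (qx,qy) */ }
//       }
//    });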

template <typename DBODY>
void CuWrap3D(const int N, DBODY &&d_body,
              const int X, const int Y, const int Z, const int G)
{
   if (N==0) { return; }
   const int GRID = G == 0 ? N : G;
   const dim3 BLCK(X,Y,Z);
   CuKernel3D<<<GRID,BLCK>>>(N,d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}
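
// Note on G (illustrative numbers, not from the original source): with the
// default G == 0 one block is launched per element and the grid-stride loop
// in CuKernel3D makes a single pass; with 0 < G < N, e.g. G = 80 and
// N = 1000, each of the 80 blocks strides through ceil(1000/80) = 13
// elements.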

template <int Dim>
struct CuWrap;

template <>
struct CuWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      CuWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct CuWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      CuWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct CuWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      CuWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif // MFEM_USE_CUDA

// [device] HIP backend. Enabled when MFEM_USE_HIP = YES.
#ifdef MFEM_USE_HIP

template <typename BODY> __global__ static
void HipKernel1D(const int N, BODY body)
{
   const int k = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void HipKernel2D(const int N, BODY body)
{
   const int k = hipBlockIdx_x*hipBlockDim_z + hipThreadIdx_z;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void HipKernel3D(const int N, BODY body)
{
   for (int k = hipBlockIdx_x; k < N; k += hipGridDim_x) { body(k); }
}

template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
void HipWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;
   hipLaunchKernelGGL(HipKernel1D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}
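
// Note (not from the original source): hipLaunchKernelGGL is HIP's portable
// spelling of a kernel launch; the two arguments after GRID and BLCK are the
// dynamic shared-memory size (0) and the stream (0, the default stream),
// followed by the kernel arguments N and d_body.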

template <typename DBODY>
void HipWrap2D(const int N, DBODY &&d_body,
               const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   hipLaunchKernelGGL(HipKernel2D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}

template <typename DBODY>
void HipWrap3D(const int N, DBODY &&d_body,
               const int X, const int Y, const int Z, const int G)
{
   if (N==0) { return; }
   const int GRID = G == 0 ? N : G;
   const dim3 BLCK(X,Y,Z);
   hipLaunchKernelGGL(HipKernel3D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}

template <int Dim>
struct HipWrap;

template <>
struct HipWrap<1>
{
   template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      HipWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct HipWrap<2>
{
   template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      HipWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct HipWrap<3>
{
   template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      HipWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif // MFEM_USE_HIP

/// The forall kernel body wrapper
template <const int DIM, typename DBODY, typename HBODY>
inline void ForallWrap(const bool use_dev, const int N,
                       DBODY &&d_body, HBODY &&h_body,
                       const int X=0, const int Y=0, const int Z=0,
                       const int G=0)
{
   MFEM_CONTRACT_VAR(X);
   MFEM_CONTRACT_VAR(Y);
   MFEM_CONTRACT_VAR(Z);
   MFEM_CONTRACT_VAR(G);
   MFEM_CONTRACT_VAR(d_body);
   if (!use_dev) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
   // If Backend::RAJA_CUDA is allowed, use it
   if (Device::Allows(Backend::RAJA_CUDA))
   {
      return RajaCuWrap<DIM>::run(N, d_body, X, Y, Z, G);
   }
#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
   // If Backend::RAJA_HIP is allowed, use it
   if (Device::Allows(Backend::RAJA_HIP))
   {
      return RajaHipWrap<DIM>::run(N, d_body, X, Y, Z, G);
   }
#endif

#ifdef MFEM_USE_CUDA
   // If Backend::CUDA is allowed, use it
   if (Device::Allows(Backend::CUDA))
   {
      return CuWrap<DIM>::run(N, d_body, X, Y, Z, G);
   }
#endif

#ifdef MFEM_USE_HIP
   // If Backend::HIP is allowed, use it
   if (Device::Allows(Backend::HIP))
   {
      return HipWrap<DIM>::run(N, d_body, X, Y, Z, G);
   }
#endif

   // If Backend::DEBUG_DEVICE is allowed, use it
   if (Device::Allows(Backend::DEBUG_DEVICE)) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
   // If Backend::RAJA_OMP is allowed, use it
   if (Device::Allows(Backend::RAJA_OMP)) { return RajaOmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_OPENMP
   // If Backend::OMP is allowed, use it
   if (Device::Allows(Backend::OMP)) { return OmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_RAJA
   // If Backend::RAJA_CPU is allowed, use it
   if (Device::Allows(Backend::RAJA_CPU)) { return RajaSeqWrap(N, h_body); }
#endif

backend_cpu:
   // Handle Backend::CPU. This is also a fallback for any allowed backends
   // not handled above.
   for (int k = 0; k < N; k++) { h_body(k); }
}
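
// Illustrative sketch (not part of the original header): which branch of
// ForallWrap runs is decided at runtime by the backends enabled on the
// mfem::Device, e.g.
//
//    mfem::Device device("cuda");    // or "raja-cuda", "omp", "cpu", ...
//    MFEM_FORALL(i, n, /* ... */;);  // dispatches through ForallWrap<1>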

#endif // MFEM_FORALL_HPP