#ifndef MFEM_FORALL_HPP
#define MFEM_FORALL_HPP

#include "../config/config.hpp"
#include "../linalg/dtensor.hpp"

// MFEM pragma macro that can be used inside MFEM_FORALL macros.
#define MFEM_PRAGMA(X) _Pragma(#X)
// MFEM_UNROLL pragma macro that can be used inside MFEM_FORALL macros.
#if defined(MFEM_USE_CUDA)
#define MFEM_UNROLL(N) MFEM_PRAGMA(unroll(N))
#else
#define MFEM_UNROLL(N)
#endif
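
// Illustrative sketch (not part of this header): MFEM_UNROLL is meant to be
// used inside an MFEM_FORALL* body, immediately before a loop whose trip
// count is known at compile time. The array 'd_x' and the size 'NE' below
// are hypothetical.
//
//    MFEM_FORALL(e, NE,
//    {
//       MFEM_UNROLL(4)
//       for (int q = 0; q < 4; q++) { d_x[e*4 + q] = 0.0; }
//    });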

// The MFEM_FORALL wrapper
#define MFEM_FORALL(i,N,...)                            \
   ForallWrap<1>(true,N,                                \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__})
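
// Illustrative sketch (not part of this header): a typical MFEM_FORALL use,
// an axpy-style update on device-visible arrays. 'n', 'a', 'd_x', and 'd_y'
// are hypothetical.
//
//    MFEM_FORALL(i, n, d_y[i] += a * d_x[i];);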

// MFEM_FORALL with a 2D CUDA block
#define MFEM_FORALL_2D(i,N,X,Y,BZ,...)                  \
   ForallWrap<2>(true,N,                                \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__}, \
                 X,Y,BZ)
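
// Illustrative sketch (not part of this header): a 2D launch where groups of
// BZ iterations share one X x Y x BZ thread block. 'NE' and 'Q1D' are
// hypothetical; MFEM_FOREACH_THREAD is provided elsewhere in MFEM.
//
//    MFEM_FORALL_2D(e, NE, Q1D, Q1D, 2,
//    {
//       MFEM_FOREACH_THREAD(qy,y,Q1D)
//       {
//          MFEM_FOREACH_THREAD(qx,x,Q1D) { /* work on (qx,qy) of element e */ }
//       }
//    });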

// MFEM_FORALL with a 3D CUDA block
#define MFEM_FORALL_3D(i,N,X,Y,Z,...)                   \
   ForallWrap<3>(true,N,                                \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__}, \
                 X,Y,Z)
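
// Illustrative sketch (not part of this header): a 3D launch with one team
// per element and an X x Y x Z thread block. 'NE', 'Q1D', and 'd_v' are
// hypothetical; MFEM_FOREACH_THREAD is provided elsewhere in MFEM.
//
//    MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D,
//    {
//       MFEM_FOREACH_THREAD(qz,z,Q1D)
//       {
//          MFEM_FOREACH_THREAD(qy,y,Q1D)
//          {
//             MFEM_FOREACH_THREAD(qx,x,Q1D) { d_v(qx,qy,qz,e) = 0.0; }
//          }
//       }
//    });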

// MFEM_FORALL with a 3D CUDA block and grid
// With G=0, this is the same as MFEM_FORALL_3D(i,N,X,Y,Z,...)
#define MFEM_FORALL_3D_GRID(i,N,X,Y,Z,G,...)            \
   ForallWrap<3>(true,N,                                \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__}, \
                 X,Y,Z,G)
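
// Illustrative sketch (not part of this header): same as MFEM_FORALL_3D, but
// the grid is capped at G teams, so the kernel strides over the N items
// (see CuKernel3D below). 'NE' and 'Q1D' are hypothetical.
//
//    MFEM_FORALL_3D_GRID(e, NE, Q1D, Q1D, Q1D, 80,
//    {
//       // body identical to an MFEM_FORALL_3D body
//    });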

// MFEM_FORALL that uses the basic CPU backend when use_dev is false. See, for
// example, the functions in vector.cpp, where we don't want to use the mfem
// device for operations on small vectors.
#define MFEM_FORALL_SWITCH(use_dev,i,N,...)             \
   ForallWrap<1>(use_dev,N,                             \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__})
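
// Illustrative sketch (not part of this header): skip the device for small
// loops. 'device_enabled', 'n', 'a', and 'd_y' are hypothetical.
//
//    const bool use_dev = device_enabled && (n > 1000);
//    MFEM_FORALL_SWITCH(use_dev, i, n, d_y[i] *= a;);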

/// OpenMP backend
template <typename HBODY>
void OmpWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_OPENMP
   #pragma omp parallel for
   for (int k = 0; k < N; k++)
   {
      h_body(k);
   }
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("OpenMP requested for MFEM but OpenMP is not enabled!");
#endif
}

/// RAJA Cuda and Hip backends
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
using cuda_launch_policy = RAJA::expt::LaunchPolicy<RAJA::expt::null_launch_t, RAJA::expt::cuda_launch_t<true>>;
using cuda_teams_x = RAJA::expt::LoopPolicy<RAJA::loop_exec, RAJA::cuda_block_x_direct>;
using cuda_threads_z = RAJA::expt::LoopPolicy<RAJA::loop_exec, RAJA::cuda_thread_z_direct>;
#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
using hip_launch_policy = RAJA::expt::LaunchPolicy<RAJA::expt::null_launch_t, RAJA::expt::hip_launch_t<true>>;
using hip_teams_x = RAJA::expt::LoopPolicy<RAJA::loop_exec, RAJA::hip_block_x_direct>;
using hip_threads_z = RAJA::expt::LoopPolicy<RAJA::loop_exec, RAJA::hip_thread_z_direct>;
#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
template <const int BLOCKS = MFEM_CUDA_BLOCKS, typename DBODY>
void RajaCuWrap1D(const int N, DBODY &&d_body)
{
   // true denotes asynchronous kernel
   RAJA::forall<RAJA::cuda_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
}

template <typename DBODY>
void RajaCuWrap2D(const int N, DBODY &&d_body,
                  const int X, const int Y, const int BZ)
{
   MFEM_VERIFY(N>0, "");
   MFEM_VERIFY(BZ>0, "");
   const int G = (N+BZ-1)/BZ;

   using namespace RAJA::expt;
   using RAJA::RangeSegment;

   launch<cuda_launch_policy>
   (DEVICE, Resources(Teams(G), Threads(X, Y, BZ)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<cuda_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
      {
         loop<cuda_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
         {
            const int k = n*BZ + tz;
            if (k >= N) { return; }
            d_body(k);
         });
      });
   });

   MFEM_GPU_CHECK(cudaGetLastError());
}
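
// Worked example of the batching above (illustrative numbers): with N = 102
// elements and BZ = 4, G = (102+4-1)/4 = 26 teams are launched; team n,
// z-thread tz handles element k = n*4 + tz, and the threads with k >= 102
// (k = 102, 103 in the last team) return at the guard.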

template <typename DBODY>
void RajaCuWrap3D(const int N, DBODY &&d_body,
                  const int X, const int Y, const int Z, const int G)
{
   MFEM_VERIFY(N>0, "");
   const int GRID = G == 0 ? N : G;
   using namespace RAJA::expt;
   using RAJA::RangeSegment;

   launch<cuda_launch_policy>
   (DEVICE, Resources(Teams(GRID), Threads(X, Y, Z)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<cuda_teams_x>(ctx, RangeSegment(0, N), d_body);
   });

   MFEM_GPU_CHECK(cudaGetLastError());
}

#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
template <const int BLOCKS = MFEM_HIP_BLOCKS, typename DBODY>
void RajaHipWrap1D(const int N, DBODY &&d_body)
{
   // true denotes asynchronous kernel
   RAJA::forall<RAJA::hip_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
}

template <typename DBODY>
void RajaHipWrap2D(const int N, DBODY &&d_body,
                   const int X, const int Y, const int BZ)
{
   MFEM_VERIFY(N>0, "");
   MFEM_VERIFY(BZ>0, "");
   const int G = (N+BZ-1)/BZ;

   using namespace RAJA::expt;
   using RAJA::RangeSegment;

   launch<hip_launch_policy>
   (DEVICE, Resources(Teams(G), Threads(X, Y, BZ)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<hip_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
      {
         loop<hip_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
         {
            const int k = n*BZ + tz;
            if (k >= N) { return; }
            d_body(k);
         });
      });
   });

   MFEM_GPU_CHECK(hipGetLastError());
}

template <typename DBODY>
void RajaHipWrap3D(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
{
   MFEM_VERIFY(N>0, "");
   const int GRID = G == 0 ? N : G;
   using namespace RAJA::expt;
   using RAJA::RangeSegment;

   launch<hip_launch_policy>
   (DEVICE, Resources(Teams(GRID), Threads(X, Y, Z)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<hip_teams_x>(ctx, RangeSegment(0, N), d_body);
   });

   MFEM_GPU_CHECK(hipGetLastError());
}

#endif

/// RAJA OpenMP backend
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
template <typename HBODY>
void RajaOmpWrap(const int N, HBODY &&h_body)
{
   RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0,N), h_body);
}
#endif

/// RAJA sequential loop backend
template <typename HBODY>
void RajaSeqWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_RAJA
   RAJA::forall<RAJA::loop_exec>(RAJA::RangeSegment(0,N), h_body);
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("RAJA requested but RAJA is not enabled!");
#endif
}

/// CUDA backend
#ifdef MFEM_USE_CUDA

template <typename BODY> __global__ static
void CuKernel1D(const int N, BODY body)
{
   const int k = blockDim.x*blockIdx.x + threadIdx.x;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void CuKernel2D(const int N, BODY body)
{
   const int k = blockIdx.x*blockDim.z + threadIdx.z;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void CuKernel3D(const int N, BODY body)
{
   for (int k = blockIdx.x; k < N; k += gridDim.x) { body(k); }
}

template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
void CuWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;
   CuKernel1D<<<GRID,BLCK>>>(N, d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}
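
// Worked example of the launch configuration above (illustrative numbers,
// assuming the default MFEM_CUDA_BLOCKS of 256): with N = 1,000,000,
// GRID = (1000000+255)/256 = 3907 blocks, so 3907*256 = 1,000,192 threads are
// launched and the trailing 192 threads return at the k >= N guard in
// CuKernel1D.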

template <typename DBODY>
void CuWrap2D(const int N, DBODY &&d_body,
              const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   MFEM_VERIFY(BZ>0, "");
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   CuKernel2D<<<GRID,BLCK>>>(N,d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}

template <typename DBODY>
void CuWrap3D(const int N, DBODY &&d_body,
              const int X, const int Y, const int Z, const int G)
{
   if (N==0) { return; }
   const int GRID = G == 0 ? N : G;
   const dim3 BLCK(X,Y,Z);
   CuKernel3D<<<GRID,BLCK>>>(N,d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}

#endif // MFEM_USE_CUDA

/// HIP backend
#ifdef MFEM_USE_HIP

template <typename BODY> __global__ static
void HipKernel1D(const int N, BODY body)
{
   const int k = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void HipKernel2D(const int N, BODY body)
{
   const int k = hipBlockIdx_x*hipBlockDim_z + hipThreadIdx_z;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void HipKernel3D(const int N, BODY body)
{
   for (int k = hipBlockIdx_x; k < N; k += hipGridDim_x) { body(k); }
}

template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
void HipWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;
   hipLaunchKernelGGL(HipKernel1D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}

template <typename DBODY>
void HipWrap2D(const int N, DBODY &&d_body,
               const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   hipLaunchKernelGGL(HipKernel2D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}

template <typename DBODY>
void HipWrap3D(const int N, DBODY &&d_body,
               const int X, const int Y, const int Z, const int G)
{
   if (N==0) { return; }
   const int GRID = G == 0 ? N : G;
   const dim3 BLCK(X,Y,Z);
   hipLaunchKernelGGL(HipKernel3D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}

#endif // MFEM_USE_HIP

/// The forall kernel body wrapper
template <const int DIM, typename DBODY, typename HBODY>
inline void ForallWrap(const bool use_dev, const int N,
                       DBODY &&d_body, HBODY &&h_body,
                       const int X=0, const int Y=0, const int Z=0,
                       const int G=0)
{
   MFEM_CONTRACT_VAR(X);
   MFEM_CONTRACT_VAR(Y);
   MFEM_CONTRACT_VAR(Z);
   MFEM_CONTRACT_VAR(G);
   MFEM_CONTRACT_VAR(d_body);
   if (!use_dev) { goto backend_cpu; }
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
   if (Device::Allows(Backend::RAJA_CUDA))
   {
      if (DIM == 1) { return RajaCuWrap1D(N, d_body); }
      if (DIM == 2) { return RajaCuWrap2D(N, d_body, X, Y, Z); }
      if (DIM == 3) { return RajaCuWrap3D(N, d_body, X, Y, Z, G); }
   }
#endif
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
   if (Device::Allows(Backend::RAJA_HIP))
   {
      if (DIM == 1) { return RajaHipWrap1D(N, d_body); }
      if (DIM == 2) { return RajaHipWrap2D(N, d_body, X, Y, Z); }
      if (DIM == 3) { return RajaHipWrap3D(N, d_body, X, Y, Z, G); }
   }
#endif
#ifdef MFEM_USE_CUDA
   if (Device::Allows(Backend::CUDA))
   {
      if (DIM == 1) { return CuWrap1D(N, d_body); }
      if (DIM == 2) { return CuWrap2D(N, d_body, X, Y, Z); }
      if (DIM == 3) { return CuWrap3D(N, d_body, X, Y, Z, G); }
   }
#endif
#ifdef MFEM_USE_HIP
   if (Device::Allows(Backend::HIP))
   {
      if (DIM == 1) { return HipWrap1D(N, d_body); }
      if (DIM == 2) { return HipWrap2D(N, d_body, X, Y, Z); }
      if (DIM == 3) { return HipWrap3D(N, d_body, X, Y, Z, G); }
   }
#endif
   // If Backend::DEBUG_DEVICE is allowed, use it
   if (Device::Allows(Backend::DEBUG_DEVICE)) { goto backend_cpu; }
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
   if (Device::Allows(Backend::RAJA_OMP)) { return RajaOmpWrap(N, h_body); }
#endif
#ifdef MFEM_USE_OPENMP
   if (Device::Allows(Backend::OMP)) { return OmpWrap(N, h_body); }
#endif
#ifdef MFEM_USE_RAJA
   if (Device::Allows(Backend::RAJA_CPU)) { return RajaSeqWrap(N, h_body); }
#endif
backend_cpu:
   // Handle Backend::CPU; fallback for any allowed backends not handled above.
   for (int k = 0; k < N; k++) { h_body(k); }
}
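
// Illustrative sketch (not part of this header): how the dispatch above is
// typically reached. With the MFEM device configured as "cuda", MFEM_FORALL
// expands to ForallWrap<1>(true, ...) and lands in CuWrap1D; without a device
// it falls through to the backend_cpu loop. 'n', 'a', 'x', and 'y' are
// hypothetical mfem::Vector objects.
//
//    const double *d_x = x.Read();
//    double *d_y = y.ReadWrite();
//    MFEM_FORALL(i, n, d_y[i] += a * d_x[i];);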

#endif // MFEM_FORALL_HPP