#ifndef MFEM_FORALL_HPP
#define MFEM_FORALL_HPP

#include "../config/config.hpp"
#include "error.hpp"
#include "backends.hpp"
#include "device.hpp"
#include "mem_manager.hpp"
#include "../linalg/dtensor.hpp"

namespace mfem
{

// MFEM pragma macro that can be used inside MFEM_FORALL macros.
#define MFEM_PRAGMA(X) _Pragma(#X)

// MFEM_UNROLL pragma macro that can be used inside MFEM_FORALL macros.
#if defined(MFEM_USE_CUDA)
#define MFEM_UNROLL(N) MFEM_PRAGMA(unroll N)
#else
#define MFEM_UNROLL(N)
#endif

#define MFEM_FORALL(i,N,...)                             \
   ForallWrap<1>(true,N,                                 \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__},  \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__})
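
// Example usage of MFEM_FORALL (a sketch, not part of this header): the body
// becomes the kernel, so `d_x` and `d_y` must be device-accessible pointers,
// e.g. obtained from mfem::Vector::Read()/ReadWrite().
//
//    const double *d_x = x.Read();
//    double *d_y = y.ReadWrite();
//    MFEM_FORALL(i, N, d_y[i] += a * d_x[i]; );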

#define MFEM_FORALL_2D(i,N,X,Y,BZ,...)                   \
   ForallWrap<2>(true,N,                                 \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__},  \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__},  \
                 X,Y,BZ)

#define MFEM_FORALL_3D(i,N,X,Y,Z,...)                    \
   ForallWrap<3>(true,N,                                 \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__},  \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__},  \
                 X,Y,Z)
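
// Example usage of MFEM_FORALL_3D (a sketch): one X x Y x Z thread block is
// launched per element `e`, and MFEM_FOREACH_THREAD / MFEM_SYNC_THREAD (from
// MFEM's device macros) distribute the inner loops over that block.
//
//    MFEM_FORALL_3D(e, NE, Q1D, Q1D, 1,
//    {
//       MFEM_FOREACH_THREAD(qy, y, Q1D)
//       {
//          MFEM_FOREACH_THREAD(qx, x, Q1D)
//          {
//             // ... per-quadrature-point work on element e ...
//          }
//       }
//       MFEM_SYNC_THREAD;
//    });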

#define MFEM_FORALL_SWITCH(use_dev,i,N,...)              \
   ForallWrap<1>(use_dev,N,                              \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__},  \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__})
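
// Example usage of MFEM_FORALL_SWITCH (a sketch): dispatch to the device
// only when the data actually lives there, e.g. for small vectors.
//
//    const bool use_dev = v.UseDevice();
//    double *d_v = v.Write(use_dev);
//    MFEM_FORALL_SWITCH(use_dev, i, v.Size(), d_v[i] = 0.0; );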

/// OpenMP backend
template <typename HBODY>
void OmpWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_OPENMP
   #pragma omp parallel for
   for (int k = 0; k < N; k++)
   {
      h_body(k);
   }
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("OpenMP requested for MFEM but OpenMP is not enabled!");
#endif
}

/// RAJA CUDA backend
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)

#if RAJA_VERSION_MAJOR == 0 && RAJA_VERSION_MINOR < 12
using RAJA::statement::Segs;
#else
using RAJA::Segs;
#endif

template <const int BLOCKS = MFEM_CUDA_BLOCKS, typename DBODY>
void RajaCudaWrap1D(const int N, DBODY &&d_body)
{
   // 'true' selects the asynchronous variant of the CUDA execution policy
   RAJA::forall<RAJA::cuda_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
}

template <typename DBODY>
void RajaCudaWrap2D(const int N, DBODY &&d_body,
                    const int X, const int Y, const int BZ)
{
   MFEM_VERIFY(N>0, "");
   MFEM_VERIFY(BZ>0, "");
   const int G = (N+BZ-1)/BZ;
   RAJA::kernel<RAJA::KernelPolicy<
   RAJA::statement::CudaKernelAsync<
   RAJA::statement::For<0, RAJA::cuda_block_x_direct,
   RAJA::statement::For<1, RAJA::cuda_thread_x_direct,
   RAJA::statement::For<2, RAJA::cuda_thread_y_direct,
   RAJA::statement::For<3, RAJA::cuda_thread_z_direct,
   RAJA::statement::Lambda<0, Segs<0>>>>>>>>>
   (RAJA::make_tuple(RAJA::RangeSegment(0,G), RAJA::RangeSegment(0,X),
                     RAJA::RangeSegment(0,Y), RAJA::RangeSegment(0,BZ)),
    [=] RAJA_DEVICE (const int n)
   {
      const int k = n*BZ + threadIdx.z;
      if (k >= N) { return; }
      d_body(k);
   });
   MFEM_GPU_CHECK(cudaGetLastError());
}
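
// Note on the policy above: segment 0 (the G "super-blocks") is mapped to
// CUDA blocks and segments 1-3 (X, Y, BZ) to the x/y/z thread dimensions, so
// each block processes BZ consecutive elements, one per threadIdx.z slice,
// mirroring the plain CUDA CuKernel2D further below.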

template <typename DBODY>
void RajaCudaWrap3D(const int N, DBODY &&d_body,
                    const int X, const int Y, const int Z)
{
   MFEM_VERIFY(N>0, "");
   RAJA::kernel<RAJA::KernelPolicy<
   RAJA::statement::CudaKernelAsync<
   RAJA::statement::For<0, RAJA::cuda_block_x_direct,
   RAJA::statement::For<1, RAJA::cuda_thread_x_direct,
   RAJA::statement::For<2, RAJA::cuda_thread_y_direct,
   RAJA::statement::For<3, RAJA::cuda_thread_z_direct,
   RAJA::statement::Lambda<0, Segs<0>>>>>>>>>
   (RAJA::make_tuple(RAJA::RangeSegment(0,N), RAJA::RangeSegment(0,X),
                     RAJA::RangeSegment(0,Y), RAJA::RangeSegment(0,Z)),
    [=] RAJA_DEVICE (const int k) { d_body(k); });
   MFEM_GPU_CHECK(cudaGetLastError());
}

#endif // defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)

/// RAJA OpenMP backend
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)

#if RAJA_VERSION_MAJOR == 0 && RAJA_VERSION_MINOR < 12
using RAJA::statement::Segs;
#endif

template <typename HBODY>
void RajaOmpWrap(const int N, HBODY &&h_body)
{
   RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0,N), h_body);
}

#endif // defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)

/// RAJA sequential loop backend
template <typename HBODY>
void RajaSeqWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_RAJA
   RAJA::forall<RAJA::loop_exec>(RAJA::RangeSegment(0,N), h_body);
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("RAJA requested but RAJA is not enabled!");
#endif
}

/// CUDA backend
#ifdef MFEM_USE_CUDA

template <typename BODY> __global__ static
void CuKernel1D(const int N, BODY body)
{
   const int k = blockDim.x*blockIdx.x + threadIdx.x;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void CuKernel2D(const int N, BODY body, const int BZ)
{
   const int k = blockIdx.x*BZ + threadIdx.z;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void CuKernel3D(const int N, BODY body)
{
   const int k = blockIdx.x;
   if (k >= N) { return; }
   body(k);
}
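
// Note: in the 3D case each CUDA block handles exactly one element
// (k = blockIdx.x); the X x Y x Z threads of the block are expected to
// cooperate inside body(k), e.g. via MFEM_FOREACH_THREAD in kernels written
// with MFEM_FORALL_3D.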

template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
void CuWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;
   CuKernel1D<<<GRID,BLCK>>>(N, d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}
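
// Worked example of the grid computation (assuming the default
// MFEM_CUDA_BLOCKS of 256): N = 1000 gives GRID = (1000+255)/256 = 4 blocks
// of 256 threads; the k >= N guard in CuKernel1D masks the 24 extra threads
// of the last block.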

template <typename DBODY>
void CuWrap2D(const int N, DBODY &&d_body,
              const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   MFEM_VERIFY(BZ>0, "");
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   CuKernel2D<<<GRID,BLCK>>>(N,d_body,BZ);
   MFEM_GPU_CHECK(cudaGetLastError());
}
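
// Worked example: N = 102 with (X,Y,BZ) = (8,8,4) gives
// GRID = (102+3)/4 = 26 blocks of 8x8x4 threads; each block covers BZ = 4
// consecutive elements (one per threadIdx.z slice) and the k >= N guard in
// CuKernel2D masks the last 2 slices.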

template <typename DBODY>
void CuWrap3D(const int N, DBODY &&d_body,
              const int X, const int Y, const int Z)
{
   if (N==0) { return; }
   const int GRID = N;
   const dim3 BLCK(X,Y,Z);
   CuKernel3D<<<GRID,BLCK>>>(N,d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}

#endif // MFEM_USE_CUDA

/// HIP backend
#ifdef MFEM_USE_HIP

template <typename BODY> __global__ static
void HipKernel1D(const int N, BODY body)
{
   const int k = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void HipKernel2D(const int N, BODY body, const int BZ)
{
   const int k = hipBlockIdx_x*BZ + hipThreadIdx_z;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void HipKernel3D(const int N, BODY body)
{
   const int k = hipBlockIdx_x;
   if (k >= N) { return; }
   body(k);
}

template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
void HipWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;
   hipLaunchKernelGGL(HipKernel1D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}

template <typename DBODY>
void HipWrap2D(const int N, DBODY &&d_body,
               const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   hipLaunchKernelGGL(HipKernel2D,GRID,BLCK,0,0,N,d_body,BZ);
   MFEM_GPU_CHECK(hipGetLastError());
}

template <typename DBODY>
void HipWrap3D(const int N, DBODY &&d_body,
               const int X, const int Y, const int Z)
{
   if (N==0) { return; }
   const int GRID = N;
   const dim3 BLCK(X,Y,Z);
   hipLaunchKernelGGL(HipKernel3D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}

#endif // MFEM_USE_HIP

/// The forall kernel body wrapper
template <const int DIM, typename DBODY, typename HBODY>
inline void ForallWrap(const bool use_dev, const int N,
                       DBODY &&d_body, HBODY &&h_body,
                       const int X=0, const int Y=0, const int Z=0)
{
   MFEM_CONTRACT_VAR(X);
   MFEM_CONTRACT_VAR(Y);
   MFEM_CONTRACT_VAR(Z);
   MFEM_CONTRACT_VAR(d_body);
   if (!use_dev) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
   // If Backend::RAJA_CUDA is allowed, use it
   if (Device::Allows(Backend::RAJA_CUDA))
   {
      if (DIM == 1) { return RajaCudaWrap1D(N, d_body); }
      if (DIM == 2) { return RajaCudaWrap2D(N, d_body, X, Y, Z); }
      if (DIM == 3) { return RajaCudaWrap3D(N, d_body, X, Y, Z); }
   }
#endif

#ifdef MFEM_USE_CUDA
   // If Backend::CUDA is allowed, use it
   if (Device::Allows(Backend::CUDA))
   {
      if (DIM == 1) { return CuWrap1D(N, d_body); }
      if (DIM == 2) { return CuWrap2D(N, d_body, X, Y, Z); }
      if (DIM == 3) { return CuWrap3D(N, d_body, X, Y, Z); }
   }
#endif

#ifdef MFEM_USE_HIP
   // If Backend::HIP is allowed, use it
   if (Device::Allows(Backend::HIP))
   {
      if (DIM == 1) { return HipWrap1D(N, d_body); }
      if (DIM == 2) { return HipWrap2D(N, d_body, X, Y, Z); }
      if (DIM == 3) { return HipWrap3D(N, d_body, X, Y, Z); }
   }
#endif

   // If Backend::DEBUG_DEVICE is allowed, fall through to the host loop
   if (Device::Allows(Backend::DEBUG_DEVICE)) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
   // If Backend::RAJA_OMP is allowed, use it
   if (Device::Allows(Backend::RAJA_OMP)) { return RajaOmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_OPENMP
   // If Backend::OMP is allowed, use it
   if (Device::Allows(Backend::OMP)) { return OmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_RAJA
   // If Backend::RAJA_CPU is allowed, use it
   if (Device::Allows(Backend::RAJA_CPU)) { return RajaSeqWrap(N, h_body); }
#endif

backend_cpu:
   // Handle Backend::CPU; this is also the fallback for any allowed backend
   // not handled above.
   for (int k = 0; k < N; k++) { h_body(k); }
}

} // namespace mfem

#endif // MFEM_FORALL_HPP