#ifndef MFEM_FORALL_HPP
#define MFEM_FORALL_HPP

#include "../config/config.hpp"
#include "error.hpp"
#include "backends.hpp"
#include "device.hpp"
#include "../linalg/dtensor.hpp"

#ifdef MFEM_USE_RAJA
#include "RAJA/RAJA.hpp"
#if defined(RAJA_ENABLE_CUDA) && !defined(MFEM_USE_CUDA)
#error When RAJA is built with CUDA, MFEM_USE_CUDA=YES is required
#endif
#endif

namespace mfem
{

// MFEM pragma macro usable inside MFEM_FORALL bodies.
#define MFEM_PRAGMA(X) _Pragma(#X)

#if defined(MFEM_USE_CUDA)
#define MFEM_UNROLL(N) MFEM_PRAGMA(unroll N)
#else
#define MFEM_UNROLL(N)
#endif

// The MFEM_FORALL wrapper: 1D kernel over the range [0,N).
#define MFEM_FORALL(i,N,...)                             \
   ForallWrap<1>(true,N,                                 \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__},  \
                 [&] (int i) {__VA_ARGS__})

// MFEM_FORALL using a 2D CUDA block: X*Y threads per element, BZ elements per block.
#define MFEM_FORALL_2D(i,N,X,Y,BZ,...)                   \
   ForallWrap<2>(true,N,                                 \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__},  \
                 [&] (int i) {__VA_ARGS__},              \
                 X,Y,BZ)

// MFEM_FORALL using a 3D CUDA block: X*Y*Z threads per element.
#define MFEM_FORALL_3D(i,N,X,Y,Z,...)                    \
   ForallWrap<3>(true,N,                                 \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__},  \
                 [&] (int i) {__VA_ARGS__},              \
                 X,Y,Z)

// MFEM_FORALL that uses the basic CPU backend when use_dev is false,
// e.g. for operations on small vectors.
#define MFEM_FORALL_SWITCH(use_dev,i,N,...)              \
   ForallWrap<1>(use_dev,N,                              \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__},  \
                 [&] (int i) {__VA_ARGS__})
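
// Illustrative usage sketch (hypothetical helper, not part of this header):
// an axpy-style update over N entries, where x and y are device-accessible
// pointers (e.g. obtained from mfem::Vector::Read()/ReadWrite()). The body is
// captured twice, as a device lambda and as a host lambda, and ForallWrap
// selects one at run time based on the configured backend.
//
//    void AddScaled(const int N, const double a, const double *x, double *y)
//    {
//       MFEM_FORALL(i, N, y[i] += a * x[i];);
//    }
//
// For small arrays, MFEM_FORALL_SWITCH(use_dev, i, N, ...) with
// use_dev == false forces the sequential CPU path.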

/// OpenMP backend.
template <typename HBODY>
void OmpWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_OPENMP
   #pragma omp parallel for
   for (int k = 0; k < N; k++)
   {
      h_body(k);
   }
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("OpenMP requested for MFEM but OpenMP is not enabled!");
#endif
}

/// RAJA CUDA backend.
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)

using RAJA::statement::Segs;

template <const int BLOCKS = MFEM_CUDA_BLOCKS, typename DBODY>
void RajaCudaWrap1D(const int N, DBODY &&d_body)
{
   // The 'true' template argument requests an asynchronous launch.
   RAJA::forall<RAJA::cuda_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
}

template <typename DBODY>
void RajaCudaWrap2D(const int N, DBODY &&d_body,
                    const int X, const int Y, const int BZ)
{
   MFEM_VERIFY(N>0, "");
   MFEM_VERIFY(BZ>0, "");
   // Pack BZ elements per block; the grid size is the ceiling of N/BZ.
   const int G = (N+BZ-1)/BZ;
   RAJA::kernel<RAJA::KernelPolicy<
   RAJA::statement::CudaKernelAsync<
   RAJA::statement::For<0, RAJA::cuda_block_x_direct,
   RAJA::statement::For<1, RAJA::cuda_thread_x_direct,
   RAJA::statement::For<2, RAJA::cuda_thread_y_direct,
   RAJA::statement::For<3, RAJA::cuda_thread_z_direct,
   RAJA::statement::Lambda<0, Segs<0>>>>>>>>>
   (RAJA::make_tuple(RAJA::RangeSegment(0,G), RAJA::RangeSegment(0,X),
                     RAJA::RangeSegment(0,Y), RAJA::RangeSegment(0,BZ)),
    [=] RAJA_DEVICE (const int n)
   {
      const int k = n*BZ + threadIdx.z;
      if (k >= N) { return; }
      d_body(k);
   });
   MFEM_GPU_CHECK(cudaGetLastError());
}

template <typename DBODY>
void RajaCudaWrap3D(const int N, DBODY &&d_body,
                    const int X, const int Y, const int Z)
{
   MFEM_VERIFY(N>0, "");
   RAJA::kernel<RAJA::KernelPolicy<
   RAJA::statement::CudaKernelAsync<
   RAJA::statement::For<0, RAJA::cuda_block_x_direct,
   RAJA::statement::For<1, RAJA::cuda_thread_x_direct,
   RAJA::statement::For<2, RAJA::cuda_thread_y_direct,
   RAJA::statement::For<3, RAJA::cuda_thread_z_direct,
   RAJA::statement::Lambda<0, Segs<0>>>>>>>>>
   (RAJA::make_tuple(RAJA::RangeSegment(0,N), RAJA::RangeSegment(0,X),
                     RAJA::RangeSegment(0,Y), RAJA::RangeSegment(0,Z)),
    [=] RAJA_DEVICE (const int k) { d_body(k); });
   MFEM_GPU_CHECK(cudaGetLastError());
}

#endif

/// RAJA OpenMP backend.
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
using RAJA::statement::Segs;

template <typename HBODY>
void RajaOmpWrap(const int N, HBODY &&h_body)
{
   RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0,N), h_body);
}

#endif

/// RAJA sequential loop backend.
template <typename HBODY>
void RajaSeqWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_RAJA
   RAJA::forall<RAJA::loop_exec>(RAJA::RangeSegment(0,N), h_body);
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("RAJA requested but RAJA is not enabled!");
#endif
}

/// CUDA backend.
#ifdef MFEM_USE_CUDA

template <typename BODY> __global__ static
void CuKernel1D(const int N, BODY body)
{
   const int k = blockDim.x*blockIdx.x + threadIdx.x;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void CuKernel2D(const int N, BODY body, const int BZ)
{
   const int k = blockIdx.x*BZ + threadIdx.z;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void CuKernel3D(const int N, BODY body)
{
   // One block per element; all X*Y*Z threads of the block see the same k.
   const int k = blockIdx.x;
   if (k >= N) { return; }
   body(k);
}

template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
void CuWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;
   CuKernel1D<<<GRID,BLCK>>>(N, d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}
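
// Worked example of the launch size (illustrative numbers): with N = 1000 and
// BLCK = 256 (a typical value for MFEM_CUDA_BLOCKS), GRID = (1000+255)/256 = 4
// blocks of 256 threads, i.e. 1024 threads in total; the trailing 24 threads
// are masked out by the `if (k >= N)` guard in CuKernel1D.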

template <typename DBODY>
void CuWrap2D(const int N, DBODY &&d_body,
              const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   MFEM_VERIFY(BZ>0, "");
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   CuKernel2D<<<GRID,BLCK>>>(N,d_body,BZ);
   MFEM_GPU_CHECK(cudaGetLastError());
}
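
// Worked example (illustrative numbers): MFEM_FORALL_2D(i, N, 8, 8, 2, ...)
// reaches this wrapper with X = Y = 8 and BZ = 2, giving blocks of
// 8 x 8 x 2 = 128 threads. Each block covers two consecutive elements
// (threadIdx.z selects the element in CuKernel2D), so GRID = (N+1)/2 blocks
// are launched.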

template <typename DBODY>
void CuWrap3D(const int N, DBODY &&d_body,
              const int X, const int Y, const int Z)
{
   if (N==0) { return; }
   const int GRID = N;
   const dim3 BLCK(X,Y,Z);
   CuKernel3D<<<GRID,BLCK>>>(N,d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}

#endif // MFEM_USE_CUDA

/// HIP backend.
#ifdef MFEM_USE_HIP

template <typename BODY> __global__ static
void HipKernel1D(const int N, BODY body)
{
   const int k = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void HipKernel2D(const int N, BODY body, const int BZ)
{
   const int k = hipBlockIdx_x*BZ + hipThreadIdx_z;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void HipKernel3D(const int N, BODY body)
{
   const int k = hipBlockIdx_x;
   if (k >= N) { return; }
   body(k);
}

template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
void HipWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;
   hipLaunchKernelGGL(HipKernel1D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}

template <typename DBODY>
void HipWrap2D(const int N, DBODY &&d_body,
               const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   hipLaunchKernelGGL(HipKernel2D,GRID,BLCK,0,0,N,d_body,BZ);
   MFEM_GPU_CHECK(hipGetLastError());
}

template <typename DBODY>
void HipWrap3D(const int N, DBODY &&d_body,
               const int X, const int Y, const int Z)
{
   if (N==0) { return; }
   const int GRID = N;
   const dim3 BLCK(X,Y,Z);
   hipLaunchKernelGGL(HipKernel3D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}

#endif // MFEM_USE_HIP

/// The forall kernel body wrapper: dispatch to the first allowed backend.
template <const int DIM, typename DBODY, typename HBODY>
inline void ForallWrap(const bool use_dev, const int N,
                       DBODY &&d_body, HBODY &&h_body,
                       const int X=0, const int Y=0, const int Z=0)
{
   MFEM_CONTRACT_VAR(X);
   MFEM_CONTRACT_VAR(Y);
   MFEM_CONTRACT_VAR(Z);
   MFEM_CONTRACT_VAR(d_body);
   if (!use_dev) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
   // Handle all allowed CUDA backends except Backend::CUDA
   if (DIM == 1 && Device::Allows(Backend::CUDA_MASK & ~Backend::CUDA))
   { return RajaCudaWrap1D(N, d_body); }
   if (DIM == 2 && Device::Allows(Backend::CUDA_MASK & ~Backend::CUDA))
   { return RajaCudaWrap2D(N, d_body, X, Y, Z); }
   if (DIM == 3 && Device::Allows(Backend::CUDA_MASK & ~Backend::CUDA))
   { return RajaCudaWrap3D(N, d_body, X, Y, Z); }
#endif

#ifdef MFEM_USE_CUDA
   // Handle all allowed CUDA backends
   if (DIM == 1 && Device::Allows(Backend::CUDA_MASK))
   { return CuWrap1D(N, d_body); }
   if (DIM == 2 && Device::Allows(Backend::CUDA_MASK))
   { return CuWrap2D(N, d_body, X, Y, Z); }
   if (DIM == 3 && Device::Allows(Backend::CUDA_MASK))
   { return CuWrap3D(N, d_body, X, Y, Z); }
#endif

#ifdef MFEM_USE_HIP
   // Handle all allowed HIP backends
   if (DIM == 1 && Device::Allows(Backend::HIP_MASK))
   { return HipWrap1D(N, d_body); }
   if (DIM == 2 && Device::Allows(Backend::HIP_MASK))
   { return HipWrap2D(N, d_body, X, Y, Z); }
   if (DIM == 3 && Device::Allows(Backend::HIP_MASK))
   { return HipWrap3D(N, d_body, X, Y, Z); }
#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
   // Handle all allowed OpenMP backends except Backend::OMP
   if (Device::Allows(Backend::OMP_MASK & ~Backend::OMP))
   { return RajaOmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_OPENMP
   // Handle all allowed OpenMP backends
   if (Device::Allows(Backend::OMP_MASK)) { return OmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_RAJA
   // Handle all allowed CPU backends except Backend::CPU
   if (Device::Allows(Backend::CPU_MASK & ~Backend::CPU))
   { return RajaSeqWrap(N, h_body); }
#endif

backend_cpu:
   // Handle Backend::CPU: sequential host execution, also the fallback for
   // any allowed backend not handled above.
   for (int k = 0; k < N; k++) { h_body(k); }
}

} // namespace mfem
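
// Illustrative dispatch sketch (hypothetical example, not part of this
// header): once the MFEM device is configured for CUDA, a 1D MFEM_FORALL
// reaches CuWrap1D above; with no device configured it falls through to the
// backend_cpu loop in ForallWrap.
//
//    mfem::Device device("cuda");   // select Backend::CUDA at run time
//    mfem::Vector x(1000);
//    double *d_x = x.ReadWrite();   // device pointer while CUDA is active
//    MFEM_FORALL(i, x.Size(), d_x[i] = 0.0;);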

#endif // MFEM_FORALL_HPP