12 #ifndef MFEM_FORALL_HPP
13 #define MFEM_FORALL_HPP
15 #include "../config/config.hpp"
21 #include "../linalg/dtensor.hpp"
24 #include "RAJA/RAJA.hpp"
25 #if defined(RAJA_ENABLE_CUDA) && !defined(MFEM_USE_CUDA)
26 #error When RAJA is built with CUDA, MFEM_USE_CUDA=YES is required
41 #define MFEM_FORALL(i,N,...) \
42 ForallWrap<1>(true,N, \
43 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
44 [&] (int i) {__VA_ARGS__})
47 #define MFEM_FORALL_2D(i,N,X,Y,BZ,...) \
48 ForallWrap<2>(true,N, \
49 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
50 [&] (int i) {__VA_ARGS__}, \
54 #define MFEM_FORALL_3D(i,N,X,Y,Z,...) \
55 ForallWrap<3>(true,N, \
56 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
57 [&] (int i) {__VA_ARGS__}, \
63 #define MFEM_FORALL_SWITCH(use_dev,i,N,...) \
64 ForallWrap<1>(use_dev,N, \
65 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
66 [&] (int i) {__VA_ARGS__})
70 template <
typename HBODY>
73 #ifdef MFEM_USE_OPENMP
74 #pragma omp parallel for
75 for (
int k = 0; k < N; k++)
80 MFEM_ABORT(
"OpenMP requested for MFEM but OpenMP is not enabled!");
86 template <
int BLOCKS,
typename DBODY>
89 #if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
90 RAJA::forall<RAJA::cuda_exec<BLOCKS>>(RAJA::RangeSegment(0,N),d_body);
92 MFEM_ABORT(
"RAJA::Cuda requested but RAJA::Cuda is not enabled!");
98 template <
typename HBODY>
101 #if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
102 RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0,N), h_body);
104 MFEM_ABORT(
"RAJA::OpenMP requested but RAJA::OpenMP is not enabled!");
110 template <
typename HBODY>
114 RAJA::forall<RAJA::loop_exec>(RAJA::RangeSegment(0,N), h_body);
116 MFEM_ABORT(
"RAJA requested but RAJA is not enabled!");
124 template <
typename BODY> __global__
static
125 void CuKernel1D(
const int N, BODY body)
127 const int k = blockDim.x*blockIdx.x + threadIdx.x;
128 if (k >= N) {
return; }
132 template <
typename BODY> __global__
static
133 void CuKernel2D(
const int N, BODY body,
const int BZ)
135 const int k = blockIdx.x*BZ + threadIdx.z;
136 if (k >= N) {
return; }
140 template <
typename BODY> __global__
static
141 void CuKernel3D(
const int N, BODY body)
143 const int k = blockIdx.x;
144 if (k >= N) {
return; }
148 template <const
int BLCK = MFEM_CUDA_BLOCKS,
typename DBODY>
151 if (N==0) {
return; }
152 const int GRID = (N+BLCK-1)/BLCK;
153 CuKernel1D<<<GRID,BLCK>>>(N, d_body);
154 MFEM_CUDA_CHECK(cudaGetLastError());
157 template <
typename DBODY>
159 const int X,
const int Y,
const int BZ)
161 if (N==0) {
return; }
162 const int GRID = (N+BZ-1)/BZ;
163 const dim3 BLCK(X,Y,BZ);
164 CuKernel2D<<<GRID,BLCK>>>(N,d_body,BZ);
165 MFEM_CUDA_CHECK(cudaGetLastError());
168 template <
typename DBODY>
170 const int X,
const int Y,
const int Z)
172 if (N==0) {
return; }
174 const dim3 BLCK(X,Y,Z);
175 CuKernel3D<<<GRID,BLCK>>>(N,d_body);
176 MFEM_CUDA_CHECK(cudaGetLastError());
179 #endif // MFEM_USE_CUDA
183 template <const
int DIM,
typename DBODY,
typename HBODY>
185 DBODY &&d_body, HBODY &&h_body,
186 const int X=0,
const int Y=0,
const int Z=0)
188 if (!use_dev) {
goto backend_cpu; }
190 #if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
193 {
return RajaCudaWrap<MFEM_CUDA_BLOCKS>(N, d_body); }
202 {
return CuWrap2D(N, d_body, X, Y, Z); }
205 {
return CuWrap3D(N, d_body, X, Y, Z); }
208 #if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
214 #ifdef MFEM_USE_OPENMP
229 for (
int k = 0; k < N; k++) { h_body(k); }
234 #endif // MFEM_FORALL_HPP
[host] OpenMP backend. Enabled when MFEM_USE_OPENMP = YES.
void RajaCudaWrap(const int N, DBODY &&d_body)
RAJA Cuda backend.
void RajaSeqWrap(const int N, HBODY &&h_body)
RAJA sequential loop backend.
void RajaOmpWrap(const int N, HBODY &&h_body)
RAJA OpenMP backend.
void CuWrap2D(const int N, DBODY &&d_body, const int X, const int Y, const int BZ)
void CuWrap3D(const int N, DBODY &&d_body, const int X, const int Y, const int Z)
Biwise-OR of all OpenMP backends.
[host] Default CPU backend: sequential execution on each MPI rank.
Biwise-OR of all CUDA backends.
Biwise-OR of all CPU backends.
static bool Allows(unsigned long b_mask)
Return true if any of the backends in the backend mask, b_mask, are allowed.
void CuWrap1D(const int N, DBODY &&d_body)
void ForallWrap(const bool use_dev, const int N, DBODY &&d_body, HBODY &&h_body, const int X=0, const int Y=0, const int Z=0)
The forall kernel body wrapper.
[device] CUDA backend. Enabled when MFEM_USE_CUDA = YES.
void OmpWrap(const int N, HBODY &&h_body)
OpenMP backend.