forall.hpp
// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
// LICENSE and NOTICE for details. LLNL-CODE-806117.
//
// This file is part of the MFEM library. For more information and source code
// availability visit https://mfem.org.
//
// MFEM is free software; you can redistribute it and/or modify it under the
// terms of the BSD-3 license. We welcome feedback and contributions, see file
// CONTRIBUTING.md for details.

#ifndef MFEM_FORALL_HPP
#define MFEM_FORALL_HPP

#include "../config/config.hpp"
#include "error.hpp"
#include "backends.hpp"
#include "device.hpp"
#include "mem_manager.hpp"
#include "../linalg/dtensor.hpp"

namespace mfem
{

// Maximum size of dofs and quads in 1D.
const int MAX_D1D = 14;
const int MAX_Q1D = 14;

// MFEM pragma macros that can be used inside MFEM_FORALL macros.
#define MFEM_PRAGMA(X) _Pragma(#X)

// MFEM_UNROLL pragma macro that can be used inside MFEM_FORALL macros.
#if defined(MFEM_USE_CUDA)
#define MFEM_UNROLL(N) MFEM_PRAGMA(unroll N)
#else
#define MFEM_UNROLL(N)
#endif
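
// Illustrative usage sketch: MFEM_UNROLL is placed immediately before a
// fixed-trip-count loop inside a kernel body, where it expands to
// "#pragma unroll N" under CUDA and to nothing otherwise, e.g.
//
//    MFEM_UNROLL(4)
//    for (int q = 0; q < 4; ++q) { /* per-quadrature-point work */ }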

// Implementation of MFEM's "parallel for" (forall) device/host kernel
// interfaces supporting RAJA, CUDA, OpenMP, and sequential backends.

// The MFEM_FORALL wrapper
#define MFEM_FORALL(i,N,...) \
   ForallWrap<1>(true,N, \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__})

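// Illustrative usage sketch: with 'a' a scalar and 'x', 'y' device-accessible
// pointers of length 'n' (hypothetical names), an axpy-style loop reads
//
//    MFEM_FORALL(i, n, y[i] += a * x[i]; );
//
// The body is captured twice: by value for the device lambda and by reference
// for the host lambda, so it must be valid in both contexts.
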
// MFEM_FORALL with a 2D CUDA block
#define MFEM_FORALL_2D(i,N,X,Y,BZ,...) \
   ForallWrap<2>(true,N, \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__}, \
                 X,Y,BZ)

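// Illustrative usage sketch: on the CUDA backend MFEM_FORALL_2D runs the body
// once per index i, launching blocks of shape (X,Y,BZ) in which BZ indices
// share one block along threadIdx.z. A typical call over NE elements with
// Q1D x Q1D threads per element (hypothetical constants) might look like
//
//    MFEM_FORALL_2D(e, NE, Q1D, Q1D, 4,
//    {
//       // per-element work, distributed over the X/Y threads of the block
//    });
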
// MFEM_FORALL with a 3D CUDA block
#define MFEM_FORALL_3D(i,N,X,Y,Z,...) \
   ForallWrap<3>(true,N, \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__}, \
                 X,Y,Z)

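// Illustrative usage sketch: in the 3D variant each index i gets its own
// CUDA block of shape (X,Y,Z), so the body is usually written in terms of the
// thread indices within that block, e.g. with a hypothetical constant D1D:
//
//    MFEM_FORALL_3D(e, NE, D1D, D1D, D1D,
//    {
//       // one block per element e, with D1D x D1D x D1D threads
//    });
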
// MFEM_FORALL that uses the basic CPU backend when use_dev is false. See for
// example the functions in vector.cpp, where we don't want to use the mfem
// device for operations on small vectors.
#define MFEM_FORALL_SWITCH(use_dev,i,N,...) \
   ForallWrap<1>(use_dev,N, \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__})

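// Illustrative usage sketch: the first argument selects the backend at run
// time, so a caller can keep small loops on the host. Assuming a hypothetical
// flag computed from the problem size:
//
//    const bool use_dev = (n > 1000);  // heuristic threshold, illustration only
//    MFEM_FORALL_SWITCH(use_dev, i, n, y[i] = 0.0; );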

/// OpenMP backend
template <typename HBODY>
void OmpWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_OPENMP
   #pragma omp parallel for
   for (int k = 0; k < N; k++)
   {
      h_body(k);
   }
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("OpenMP requested for MFEM but OpenMP is not enabled!");
#endif
}


/// RAJA Cuda backend
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)

#if RAJA_VERSION_MAJOR == 0 && RAJA_VERSION_MINOR < 12
using RAJA::statement::Segs;
#else
using RAJA::Segs;
#endif

template <const int BLOCKS = MFEM_CUDA_BLOCKS, typename DBODY>
void RajaCudaWrap1D(const int N, DBODY &&d_body)
{
   // true denotes asynchronous kernel
   RAJA::forall<RAJA::cuda_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
}

template <typename DBODY>
void RajaCudaWrap2D(const int N, DBODY &&d_body,
                    const int X, const int Y, const int BZ)
{
   MFEM_VERIFY(N>0, "");
   MFEM_VERIFY(BZ>0, "");
   const int G = (N+BZ-1)/BZ;
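   // Each CUDA block covers BZ consecutive indices stacked along threadIdx.z,
   // so G blocks are needed; e.g. (illustration only) N=10, BZ=4 gives G=3,
   // with the unused tail of the last block discarded by the k >= N guard below.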
   RAJA::kernel<RAJA::KernelPolicy<
                RAJA::statement::CudaKernelAsync<
                RAJA::statement::For<0, RAJA::cuda_block_x_direct,
                RAJA::statement::For<1, RAJA::cuda_thread_x_direct,
                RAJA::statement::For<2, RAJA::cuda_thread_y_direct,
                RAJA::statement::For<3, RAJA::cuda_thread_z_direct,
                RAJA::statement::Lambda<0, Segs<0>>>>>>>>>
      (RAJA::make_tuple(RAJA::RangeSegment(0,G), RAJA::RangeSegment(0,X),
                        RAJA::RangeSegment(0,Y), RAJA::RangeSegment(0,BZ)),
       [=] RAJA_DEVICE (const int n)
       {
          const int k = n*BZ + threadIdx.z;
          if (k >= N) { return; }
          d_body(k);
       });
   MFEM_GPU_CHECK(cudaGetLastError());
}

template <typename DBODY>
void RajaCudaWrap3D(const int N, DBODY &&d_body,
                    const int X, const int Y, const int Z)
{
   MFEM_VERIFY(N>0, "");
   RAJA::kernel<RAJA::KernelPolicy<
                RAJA::statement::CudaKernelAsync<
                RAJA::statement::For<0, RAJA::cuda_block_x_direct,
                RAJA::statement::For<1, RAJA::cuda_thread_x_direct,
                RAJA::statement::For<2, RAJA::cuda_thread_y_direct,
                RAJA::statement::For<3, RAJA::cuda_thread_z_direct,
                RAJA::statement::Lambda<0, Segs<0>>>>>>>>>
      (RAJA::make_tuple(RAJA::RangeSegment(0,N), RAJA::RangeSegment(0,X),
                        RAJA::RangeSegment(0,Y), RAJA::RangeSegment(0,Z)),
       [=] RAJA_DEVICE (const int k) { d_body(k); });
   MFEM_GPU_CHECK(cudaGetLastError());
}

#endif


/// RAJA OpenMP backend
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)

#if RAJA_VERSION_MAJOR == 0 && RAJA_VERSION_MINOR < 12
using RAJA::statement::Segs;
#else
using RAJA::Segs;
#endif

template <typename HBODY>
void RajaOmpWrap(const int N, HBODY &&h_body)
{
   RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0,N), h_body);
}

#endif


/// RAJA sequential loop backend
template <typename HBODY>
void RajaSeqWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_RAJA
   RAJA::forall<RAJA::loop_exec>(RAJA::RangeSegment(0,N), h_body);
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("RAJA requested but RAJA is not enabled!");
#endif
}


/// CUDA backend
#ifdef MFEM_USE_CUDA

template <typename BODY> __global__ static
void CuKernel1D(const int N, BODY body)
{
   const int k = blockDim.x*blockIdx.x + threadIdx.x;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void CuKernel2D(const int N, BODY body, const int BZ)
{
   const int k = blockIdx.x*BZ + threadIdx.z;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void CuKernel3D(const int N, BODY body)
{
   const int k = blockIdx.x;
   if (k >= N) { return; }
   body(k);
}

template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
void CuWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
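   // Round the grid size up so that all N indices are covered; e.g.
   // (illustration only) N=1000 with a block size of 256 gives GRID=4.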
   const int GRID = (N+BLCK-1)/BLCK;
   CuKernel1D<<<GRID,BLCK>>>(N, d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}

template <typename DBODY>
void CuWrap2D(const int N, DBODY &&d_body,
              const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   MFEM_VERIFY(BZ>0, "");
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   CuKernel2D<<<GRID,BLCK>>>(N,d_body,BZ);
   MFEM_GPU_CHECK(cudaGetLastError());
}

template <typename DBODY>
void CuWrap3D(const int N, DBODY &&d_body,
              const int X, const int Y, const int Z)
{
   if (N==0) { return; }
   const int GRID = N;
   const dim3 BLCK(X,Y,Z);
   CuKernel3D<<<GRID,BLCK>>>(N,d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}

#endif // MFEM_USE_CUDA


/// HIP backend
#ifdef MFEM_USE_HIP

template <typename BODY> __global__ static
void HipKernel1D(const int N, BODY body)
{
   const int k = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void HipKernel2D(const int N, BODY body, const int BZ)
{
   const int k = hipBlockIdx_x*BZ + hipThreadIdx_z;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void HipKernel3D(const int N, BODY body)
{
   const int k = hipBlockIdx_x;
   if (k >= N) { return; }
   body(k);
}

template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
void HipWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;
   hipLaunchKernelGGL(HipKernel1D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}

template <typename DBODY>
void HipWrap2D(const int N, DBODY &&d_body,
               const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   hipLaunchKernelGGL(HipKernel2D,GRID,BLCK,0,0,N,d_body,BZ);
   MFEM_GPU_CHECK(hipGetLastError());
}

template <typename DBODY>
void HipWrap3D(const int N, DBODY &&d_body,
               const int X, const int Y, const int Z)
{
   if (N==0) { return; }
   const int GRID = N;
   const dim3 BLCK(X,Y,Z);
   hipLaunchKernelGGL(HipKernel3D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}

#endif // MFEM_USE_HIP


/// The forall kernel body wrapper
template <const int DIM, typename DBODY, typename HBODY>
inline void ForallWrap(const bool use_dev, const int N,
                       DBODY &&d_body, HBODY &&h_body,
                       const int X=0, const int Y=0, const int Z=0)
{
   MFEM_CONTRACT_VAR(X);
   MFEM_CONTRACT_VAR(Y);
   MFEM_CONTRACT_VAR(Z);
   MFEM_CONTRACT_VAR(d_body);
   if (!use_dev) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
   // If Backend::RAJA_CUDA is allowed, use it
   if (Device::Allows(Backend::RAJA_CUDA))
   {
      if (DIM == 1) { return RajaCudaWrap1D(N, d_body); }
      if (DIM == 2) { return RajaCudaWrap2D(N, d_body, X, Y, Z); }
      if (DIM == 3) { return RajaCudaWrap3D(N, d_body, X, Y, Z); }
   }
#endif

#ifdef MFEM_USE_CUDA
   // If Backend::CUDA is allowed, use it
   if (Device::Allows(Backend::CUDA))
   {
      if (DIM == 1) { return CuWrap1D(N, d_body); }
      if (DIM == 2) { return CuWrap2D(N, d_body, X, Y, Z); }
      if (DIM == 3) { return CuWrap3D(N, d_body, X, Y, Z); }
   }
#endif

#ifdef MFEM_USE_HIP
   // If Backend::HIP is allowed, use it
   if (Device::Allows(Backend::HIP))
   {
      if (DIM == 1) { return HipWrap1D(N, d_body); }
      if (DIM == 2) { return HipWrap2D(N, d_body, X, Y, Z); }
      if (DIM == 3) { return HipWrap3D(N, d_body, X, Y, Z); }
   }
#endif

   // If Backend::DEBUG_DEVICE is allowed, use it
   if (Device::Allows(Backend::DEBUG_DEVICE)) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
   // If Backend::RAJA_OMP is allowed, use it
   if (Device::Allows(Backend::RAJA_OMP)) { return RajaOmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_OPENMP
   // If Backend::OMP is allowed, use it
   if (Device::Allows(Backend::OMP)) { return OmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_RAJA
   // If Backend::RAJA_CPU is allowed, use it
   if (Device::Allows(Backend::RAJA_CPU)) { return RajaSeqWrap(N, h_body); }
#endif

backend_cpu:
   // Handle Backend::CPU. This is also a fallback for any allowed backends not
   // handled above, e.g. OCCA_CPU with configuration 'occa-cpu,cpu', or
   // OCCA_OMP with configuration 'occa-omp,cpu'.
   for (int k = 0; k < N; k++) { h_body(k); }
}
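
// Illustrative note: which branch of ForallWrap runs is decided at run time by
// the backends enabled through mfem::Device. For example, assuming the
// standard device configuration strings,
//
//    Device device("cuda");      // MFEM_FORALL(...) dispatches to CuWrap1D
//    Device device("raja-omp");  // the host lambda runs through RajaOmpWrap
//    Device device("cpu");       // falls through to the sequential loop
//
// Passing use_dev == false (e.g. via MFEM_FORALL_SWITCH) always forces the
// sequential backend_cpu path.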

} // namespace mfem

#endif // MFEM_FORALL_HPP