MFEM v4.3.0 - Finite element discretization library
forall.hpp
// Copyright (c) 2010-2021, Lawrence Livermore National Security, LLC. Produced
// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
// LICENSE and NOTICE for details. LLNL-CODE-806117.
//
// This file is part of the MFEM library. For more information and source code
// availability visit https://mfem.org.
//
// MFEM is free software; you can redistribute it and/or modify it under the
// terms of the BSD-3 license. We welcome feedback and contributions, see file
// CONTRIBUTING.md for details.

#ifndef MFEM_FORALL_HPP
#define MFEM_FORALL_HPP

#include "../config/config.hpp"
#include "error.hpp"
#include "backends.hpp"
#include "device.hpp"
#include "mem_manager.hpp"
#include "../linalg/dtensor.hpp"

namespace mfem
{

// Maximum size of dofs and quads in 1D.
#ifdef MFEM_USE_HIP
const int MAX_D1D = 11;
const int MAX_Q1D = 11;
#else
const int MAX_D1D = 14;
const int MAX_Q1D = 14;
#endif

// MFEM pragma macros that can be used inside MFEM_FORALL macros.
#define MFEM_PRAGMA(X) _Pragma(#X)

// MFEM_UNROLL pragma macro that can be used inside MFEM_FORALL macros.
#if defined(MFEM_USE_CUDA)
#define MFEM_UNROLL(N) MFEM_PRAGMA(unroll(N))
#else
#define MFEM_UNROLL(N)
#endif
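
// Example (illustrative sketch, not part of the original header): requesting a
// full unroll of a fixed-trip-count loop inside a device kernel body; 'Q1D' is
// an assumed compile-time bound:
//
//   MFEM_UNROLL(Q1D)
//   for (int q = 0; q < Q1D; q++) { /* per-quadrature-point work */ }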

// Implementation of MFEM's "parallel for" (forall) device/host kernel
// interfaces supporting RAJA, CUDA, OpenMP, and sequential backends.

// The MFEM_FORALL wrapper
#define MFEM_FORALL(i,N,...)                            \
   ForallWrap<1>(true,N,                                \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__})

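// Example (illustrative sketch, not part of the original header): a DAXPY-style
// update y += a*x over N entries. The device pointers are assumed to come from
// mfem::Vector::Read()/ReadWrite():
//
//   const double *d_x = x.Read();
//   double *d_y = y.ReadWrite();
//   MFEM_FORALL(i, N, d_y[i] += a * d_x[i];);
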
// MFEM_FORALL with a 2D CUDA block
#define MFEM_FORALL_2D(i,N,X,Y,BZ,...)                  \
   ForallWrap<2>(true,N,                                \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__}, \
                 X,Y,BZ)

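// Example (illustrative sketch, not part of the original header): looping over
// NE mesh elements with a Q1D x Q1D thread block per element (BZ=1), using
// MFEM's MFEM_FOREACH_THREAD helper to spread the quadrature points over the
// block; 'Q1D' and the DeviceTensor 'data' are assumed for illustration:
//
//   MFEM_FORALL_2D(e, NE, Q1D, Q1D, 1,
//   {
//      MFEM_FOREACH_THREAD(qy,y,Q1D)
//      {
//         MFEM_FOREACH_THREAD(qx,x,Q1D)
//         {
//            data(qx,qy,e) *= 2.0; // per-point work
//         }
//      }
//   });
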
// MFEM_FORALL with a 3D CUDA block
#define MFEM_FORALL_3D(i,N,X,Y,Z,...)                   \
   ForallWrap<3>(true,N,                                \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__}, \
                 X,Y,Z)

// MFEM_FORALL with a 3D CUDA block and grid
// With G=0, this is the same as MFEM_FORALL_3D(i,N,X,Y,Z,...)
#define MFEM_FORALL_3D_GRID(i,N,X,Y,Z,G,...)            \
   ForallWrap<3>(true,N,                                \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__}, \
                 X,Y,Z,G)

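// Note (explanatory, not part of the original header): with G>0 the 3D kernels
// launch G blocks and cover all N items with a block-level grid-stride loop,
// so G can be capped, e.g. near the number of multiprocessors, to bound the
// launch size. An illustrative call with an assumed cap of 80:
//
//   MFEM_FORALL_3D_GRID(e, NE, Q1D, Q1D, Q1D, 80, /* body */);
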
// MFEM_FORALL that uses the basic CPU backend when use_dev is false. See, for
// example, the functions in vector.cpp, where we don't want to use the MFEM
// device for operations on small vectors.
#define MFEM_FORALL_SWITCH(use_dev,i,N,...)             \
   ForallWrap<1>(use_dev,N,                             \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__})

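// Example (illustrative sketch, not part of the original header): choosing the
// device path at run time; UseDevice() mirrors mfem::Vector's flag, and the
// size threshold is an assumed heuristic:
//
//   const bool use_dev = v.UseDevice() && (N > 1000);
//   MFEM_FORALL_SWITCH(use_dev, i, N, d_v[i] = 0.0;);
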
/// OpenMP backend
template <typename HBODY>
void OmpWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_OPENMP
   #pragma omp parallel for
   for (int k = 0; k < N; k++)
   {
      h_body(k);
   }
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("OpenMP requested for MFEM but OpenMP is not enabled!");
#endif
}


/// RAJA Cuda and Hip backends
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
using cuda_launch_policy =
   RAJA::expt::LaunchPolicy<RAJA::expt::null_launch_t,
                            RAJA::expt::cuda_launch_t<true>>;
using cuda_teams_x =
   RAJA::expt::LoopPolicy<RAJA::loop_exec, RAJA::cuda_block_x_direct>;
using cuda_threads_z =
   RAJA::expt::LoopPolicy<RAJA::loop_exec, RAJA::cuda_thread_z_direct>;
#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
using hip_launch_policy =
   RAJA::expt::LaunchPolicy<RAJA::expt::null_launch_t,
                            RAJA::expt::hip_launch_t<true>>;
using hip_teams_x =
   RAJA::expt::LoopPolicy<RAJA::loop_exec, RAJA::hip_block_x_direct>;
using hip_threads_z =
   RAJA::expt::LoopPolicy<RAJA::loop_exec, RAJA::hip_thread_z_direct>;
#endif

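// Explanatory note (not part of the original header): each LaunchPolicy pairs
// a host policy with a device policy; null_launch_t makes these device-only,
// while {cuda,hip}_launch_t<true> launches asynchronously. The LoopPolicy
// aliases map team loops to blockIdx.x and inner loops to threadIdx.z.
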
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
template <const int BLOCKS = MFEM_CUDA_BLOCKS, typename DBODY>
void RajaCuWrap1D(const int N, DBODY &&d_body)
{
   // 'true' denotes an asynchronous kernel
   RAJA::forall<RAJA::cuda_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N), d_body);
}

template <typename DBODY>
void RajaCuWrap2D(const int N, DBODY &&d_body,
                  const int X, const int Y, const int BZ)
{
   MFEM_VERIFY(N>0, "");
   MFEM_VERIFY(BZ>0, "");
   const int G = (N+BZ-1)/BZ; // ceil(N/BZ) teams, BZ items per team

   using namespace RAJA::expt;
   using RAJA::RangeSegment;

   launch<cuda_launch_policy>
   (DEVICE, Resources(Teams(G), Threads(X, Y, BZ)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<cuda_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
      {
         loop<cuda_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
         {
            const int k = n*BZ + tz;
            if (k >= N) { return; }
            d_body(k);
         });
      });
   });

   MFEM_GPU_CHECK(cudaGetLastError());
}

template <typename DBODY>
void RajaCuWrap3D(const int N, DBODY &&d_body,
                  const int X, const int Y, const int Z, const int G)
{
   MFEM_VERIFY(N>0, "");
   const int GRID = G == 0 ? N : G; // default: one team per item
   using namespace RAJA::expt;
   using RAJA::RangeSegment;

   launch<cuda_launch_policy>
   (DEVICE, Resources(Teams(GRID), Threads(X, Y, Z)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<cuda_teams_x>(ctx, RangeSegment(0, N), d_body);
   });

   MFEM_GPU_CHECK(cudaGetLastError());
}

#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
template <const int BLOCKS = MFEM_HIP_BLOCKS, typename DBODY>
void RajaHipWrap1D(const int N, DBODY &&d_body)
{
   // 'true' denotes an asynchronous kernel
   RAJA::forall<RAJA::hip_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N), d_body);
}

template <typename DBODY>
void RajaHipWrap2D(const int N, DBODY &&d_body,
                   const int X, const int Y, const int BZ)
{
   MFEM_VERIFY(N>0, "");
   MFEM_VERIFY(BZ>0, "");
   const int G = (N+BZ-1)/BZ; // ceil(N/BZ) teams, BZ items per team

   using namespace RAJA::expt;
   using RAJA::RangeSegment;

   launch<hip_launch_policy>
   (DEVICE, Resources(Teams(G), Threads(X, Y, BZ)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<hip_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
      {
         loop<hip_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
         {
            const int k = n*BZ + tz;
            if (k >= N) { return; }
            d_body(k);
         });
      });
   });

   MFEM_GPU_CHECK(hipGetLastError());
}

template <typename DBODY>
void RajaHipWrap3D(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
{
   MFEM_VERIFY(N>0, "");
   const int GRID = G == 0 ? N : G; // default: one team per item
   using namespace RAJA::expt;
   using RAJA::RangeSegment;

   launch<hip_launch_policy>
   (DEVICE, Resources(Teams(GRID), Threads(X, Y, Z)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<hip_teams_x>(ctx, RangeSegment(0, N), d_body);
   });

   MFEM_GPU_CHECK(hipGetLastError());
}
#endif

/// RAJA OpenMP backend
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)

template <typename HBODY>
void RajaOmpWrap(const int N, HBODY &&h_body)
{
   RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0,N), h_body);
}

#endif


/// RAJA sequential loop backend
template <typename HBODY>
void RajaSeqWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_RAJA
   RAJA::forall<RAJA::loop_exec>(RAJA::RangeSegment(0,N), h_body);
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("RAJA requested but RAJA is not enabled!");
#endif
}


/// CUDA backend
#ifdef MFEM_USE_CUDA

template <typename BODY> __global__ static
void CuKernel1D(const int N, BODY body)
{
   const int k = blockDim.x*blockIdx.x + threadIdx.x;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void CuKernel2D(const int N, BODY body)
{
   const int k = blockIdx.x*blockDim.z + threadIdx.z;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void CuKernel3D(const int N, BODY body)
{
   for (int k = blockIdx.x; k < N; k += gridDim.x) { body(k); }
}
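
// Explanatory note (not part of the original header): CuKernel3D uses a
// block-level grid-stride loop, so launches with fewer than N blocks still
// visit every index k in [0,N); within a block, the X*Y*Z threads are expected
// to cooperate on item k, e.g. via MFEM_FOREACH_THREAD.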

template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
void CuWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK; // ceil(N/BLCK) blocks
   CuKernel1D<<<GRID,BLCK>>>(N, d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}

template <typename DBODY>
void CuWrap2D(const int N, DBODY &&d_body,
              const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   MFEM_VERIFY(BZ>0, "");
   const int GRID = (N+BZ-1)/BZ; // ceil(N/BZ) blocks, BZ items per block
   const dim3 BLCK(X,Y,BZ);
   CuKernel2D<<<GRID,BLCK>>>(N, d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}
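
// Usage note (an assumption, not stated in the original header): as with any
// CUDA launch, the block shape must respect the hardware limit on threads per
// block (X*Y*BZ <= 1024 on current NVIDIA GPUs); callers choose X, Y, and BZ
// accordingly.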

template <typename DBODY>
void CuWrap3D(const int N, DBODY &&d_body,
              const int X, const int Y, const int Z, const int G)
{
   if (N==0) { return; }
   const int GRID = G == 0 ? N : G; // default: one block per item
   const dim3 BLCK(X,Y,Z);
   CuKernel3D<<<GRID,BLCK>>>(N, d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}

#endif // MFEM_USE_CUDA


/// HIP backend
#ifdef MFEM_USE_HIP

template <typename BODY> __global__ static
void HipKernel1D(const int N, BODY body)
{
   const int k = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void HipKernel2D(const int N, BODY body)
{
   const int k = hipBlockIdx_x*hipBlockDim_z + hipThreadIdx_z;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void HipKernel3D(const int N, BODY body)
{
   for (int k = hipBlockIdx_x; k < N; k += hipGridDim_x) { body(k); }
}

template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
void HipWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK; // ceil(N/BLCK) blocks
   hipLaunchKernelGGL(HipKernel1D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}

template <typename DBODY>
void HipWrap2D(const int N, DBODY &&d_body,
               const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   const int GRID = (N+BZ-1)/BZ; // ceil(N/BZ) blocks, BZ items per block
   const dim3 BLCK(X,Y,BZ);
   hipLaunchKernelGGL(HipKernel2D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}

template <typename DBODY>
void HipWrap3D(const int N, DBODY &&d_body,
               const int X, const int Y, const int Z, const int G)
{
   if (N==0) { return; }
   const int GRID = G == 0 ? N : G; // default: one block per item
   const dim3 BLCK(X,Y,Z);
   hipLaunchKernelGGL(HipKernel3D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}

#endif // MFEM_USE_HIP


/// The forall kernel body wrapper
template <const int DIM, typename DBODY, typename HBODY>
inline void ForallWrap(const bool use_dev, const int N,
                       DBODY &&d_body, HBODY &&h_body,
                       const int X=0, const int Y=0, const int Z=0,
                       const int G=0)
{
   MFEM_CONTRACT_VAR(X);
   MFEM_CONTRACT_VAR(Y);
   MFEM_CONTRACT_VAR(Z);
   MFEM_CONTRACT_VAR(G);
   MFEM_CONTRACT_VAR(d_body);
   if (!use_dev) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
   // If Backend::RAJA_CUDA is allowed, use it
   if (Device::Allows(Backend::RAJA_CUDA))
   {
      if (DIM == 1) { return RajaCuWrap1D(N, d_body); }
      if (DIM == 2) { return RajaCuWrap2D(N, d_body, X, Y, Z); }
      if (DIM == 3) { return RajaCuWrap3D(N, d_body, X, Y, Z, G); }
   }
#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
   // If Backend::RAJA_HIP is allowed, use it
   if (Device::Allows(Backend::RAJA_HIP))
   {
      if (DIM == 1) { return RajaHipWrap1D(N, d_body); }
      if (DIM == 2) { return RajaHipWrap2D(N, d_body, X, Y, Z); }
      if (DIM == 3) { return RajaHipWrap3D(N, d_body, X, Y, Z, G); }
   }
#endif

#ifdef MFEM_USE_CUDA
   // If Backend::CUDA is allowed, use it
   if (Device::Allows(Backend::CUDA))
   {
      if (DIM == 1) { return CuWrap1D(N, d_body); }
      if (DIM == 2) { return CuWrap2D(N, d_body, X, Y, Z); }
      if (DIM == 3) { return CuWrap3D(N, d_body, X, Y, Z, G); }
   }
#endif

#ifdef MFEM_USE_HIP
   // If Backend::HIP is allowed, use it
   if (Device::Allows(Backend::HIP))
   {
      if (DIM == 1) { return HipWrap1D(N, d_body); }
      if (DIM == 2) { return HipWrap2D(N, d_body, X, Y, Z); }
      if (DIM == 3) { return HipWrap3D(N, d_body, X, Y, Z, G); }
   }
#endif

   // If Backend::DEBUG_DEVICE is allowed, use it
   if (Device::Allows(Backend::DEBUG_DEVICE)) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
   // If Backend::RAJA_OMP is allowed, use it
   if (Device::Allows(Backend::RAJA_OMP)) { return RajaOmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_OPENMP
   // If Backend::OMP is allowed, use it
   if (Device::Allows(Backend::OMP)) { return OmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_RAJA
   // If Backend::RAJA_CPU is allowed, use it
   if (Device::Allows(Backend::RAJA_CPU)) { return RajaSeqWrap(N, h_body); }
#endif

backend_cpu:
   // Handle Backend::CPU. This is also a fallback for any allowed backends not
   // handled above, e.g. OCCA_CPU with configuration 'occa-cpu,cpu', or
   // OCCA_OMP with configuration 'occa-omp,cpu'.
   for (int k = 0; k < N; k++) { h_body(k); }
}
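
// Explanatory note (not part of the original header): the dispatch above is
// ordered by priority: RAJA_CUDA, RAJA_HIP, CUDA, HIP, DEBUG_DEVICE, then the
// host backends RAJA_OMP, OMP, RAJA_CPU, and finally the plain CPU loop; the
// first backend that is both compiled in and allowed by the current
// mfem::Device configuration wins.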

} // namespace mfem

#endif // MFEM_FORALL_HPP