MFEM v4.1.0
Finite element discretization library
forall.hpp
// Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
// LICENSE and NOTICE for details. LLNL-CODE-806117.
//
// This file is part of the MFEM library. For more information and source code
// availability visit https://mfem.org.
//
// MFEM is free software; you can redistribute it and/or modify it under the
// terms of the BSD-3 license. We welcome feedback and contributions, see file
// CONTRIBUTING.md for details.

#ifndef MFEM_FORALL_HPP
#define MFEM_FORALL_HPP

#include "../config/config.hpp"
#include "error.hpp"
#include "cuda.hpp"
#include "hip.hpp"
#include "occa.hpp"
#include "device.hpp"
#include "mem_manager.hpp"
#include "../linalg/dtensor.hpp"

#ifdef MFEM_USE_RAJA
#include "RAJA/RAJA.hpp"
#if defined(RAJA_ENABLE_CUDA) && !defined(MFEM_USE_CUDA)
#error When RAJA is built with CUDA, MFEM_USE_CUDA=YES is required
#endif
#endif

namespace mfem
{

// Maximum size of dofs and quads in 1D.
const int MAX_D1D = 14;
const int MAX_Q1D = 14;

// MFEM pragma macros that can be used inside MFEM_FORALL macros.
#define MFEM_PRAGMA(X) _Pragma(#X)

// MFEM_UNROLL pragma macro that can be used inside MFEM_FORALL macros.
#if defined(MFEM_USE_CUDA)
#define MFEM_UNROLL(N) MFEM_PRAGMA(unroll N)
#else
#define MFEM_UNROLL(N)
#endif
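
// Illustrative usage (an editor's sketch, not part of the original header):
// MFEM_UNROLL annotates an inner loop inside a device kernel body, e.g. inside
// an MFEM_FORALL (defined below). The pointers 'u' and 'v' are hypothetical
// device arrays of length 4*N and N, respectively.
//
//    MFEM_FORALL(i, N,
//    {
//       double sum = 0.0;
//       MFEM_UNROLL(4)
//       for (int j = 0; j < 4; j++) { sum += u[4*i + j]; }
//       v[i] = sum;
//    });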

// Implementation of MFEM's "parallel for" (forall) device/host kernel
// interfaces supporting RAJA, CUDA, OpenMP, and sequential backends.

// The MFEM_FORALL wrapper
#define MFEM_FORALL(i,N,...) \
   ForallWrap<1>(true,N, \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] (int i) {__VA_ARGS__})
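
// Illustrative usage (a sketch, not taken verbatim from MFEM sources): scale
// the entries of an mfem::Vector. Vector::ReadWrite() returns a device pointer
// when a device backend is active, so the same body serves both lambdas.
//
//    void Scale(Vector &y, const double a)
//    {
//       const int n = y.Size();
//       double *d_y = y.ReadWrite();
//       MFEM_FORALL(i, n, d_y[i] *= a;);
//    }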

// MFEM_FORALL with a 2D CUDA block
#define MFEM_FORALL_2D(i,N,X,Y,BZ,...) \
   ForallWrap<2>(true,N, \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] (int i) {__VA_ARGS__}, \
                 X,Y,BZ)

// MFEM_FORALL with a 3D CUDA block
#define MFEM_FORALL_3D(i,N,X,Y,Z,...) \
   ForallWrap<3>(true,N, \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] (int i) {__VA_ARGS__}, \
                 X,Y,Z)
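
// Illustrative sketch (not part of the original header): in the 2D/3D variants
// the outer index (typically an element index) maps to CUDA blocks, while X, Y
// and BZ (or Z) set the thread-block shape. Kernel bodies usually address the
// threads with MFEM_FOREACH_THREAD, a macro defined elsewhere in MFEM's device
// headers. Hypothetical example with Q1D quadrature points per direction and
// NBZ elements per block:
//
//    MFEM_FORALL_2D(e, NE, Q1D, Q1D, NBZ,
//    {
//       MFEM_FOREACH_THREAD(qy,y,Q1D)
//       {
//          MFEM_FOREACH_THREAD(qx,x,Q1D)
//          {
//             // ... work on quadrature point (qx,qy) of element e ...
//          }
//       }
//    });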

// MFEM_FORALL that uses the basic CPU backend when use_dev is false. See for
// example the functions in vector.cpp, where we don't want to use the mfem
// device for operations on small vectors.
#define MFEM_FORALL_SWITCH(use_dev,i,N,...) \
   ForallWrap<1>(use_dev,N, \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] (int i) {__VA_ARGS__})
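
// Illustrative usage (a sketch modeled on the vector.cpp pattern, with made-up
// names): the caller decides at run time whether to take the device path.
//
//    void SetConstant(Vector &y, const double c)
//    {
//       const int n = y.Size();
//       const bool use_dev = y.UseDevice();
//       double *d_y = y.Write(use_dev);
//       MFEM_FORALL_SWITCH(use_dev, i, n, d_y[i] = c;);
//    }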


/// OpenMP backend
template <typename HBODY>
void OmpWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_OPENMP
   #pragma omp parallel for
   for (int k = 0; k < N; k++)
   {
      h_body(k);
   }
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("OpenMP requested for MFEM but OpenMP is not enabled!");
#endif
}


/// RAJA Cuda backend
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)

using RAJA::statement::Segs;

template <const int BLOCKS = MFEM_CUDA_BLOCKS, typename DBODY>
void RajaCudaWrap1D(const int N, DBODY &&d_body)
{
   // true denotes asynchronous kernel
   RAJA::forall<RAJA::cuda_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
}

template <typename DBODY>
void RajaCudaWrap2D(const int N, DBODY &&d_body,
                    const int X, const int Y, const int BZ)
{
   MFEM_VERIFY(N>0, "");
   MFEM_VERIFY(BZ>0, "");
   const int G = (N+BZ-1)/BZ;
   RAJA::kernel<RAJA::KernelPolicy<
      RAJA::statement::CudaKernelAsync<
         RAJA::statement::For<0, RAJA::cuda_block_x_direct,
            RAJA::statement::For<1, RAJA::cuda_thread_x_direct,
               RAJA::statement::For<2, RAJA::cuda_thread_y_direct,
                  RAJA::statement::For<3, RAJA::cuda_thread_z_direct,
                     RAJA::statement::Lambda<0, Segs<0>>>>>>>>>
      (RAJA::make_tuple(RAJA::RangeSegment(0,G), RAJA::RangeSegment(0,X),
                        RAJA::RangeSegment(0,Y), RAJA::RangeSegment(0,BZ)),
       [=] RAJA_DEVICE (const int n)
       {
          const int k = n*BZ + threadIdx.z;
          if (k >= N) { return; }
          d_body(k);
       });
   MFEM_GPU_CHECK(cudaGetLastError());
}
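
// Worked example of the decomposition above (illustrative numbers): with
// N = 102 and BZ = 4, G = (102+4-1)/4 = 26 blocks are launched; block n
// covers the indices k = n*BZ + threadIdx.z, and the 'k >= N' guard skips
// the two unused z-slots of the last block.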

template <typename DBODY>
void RajaCudaWrap3D(const int N, DBODY &&d_body,
                    const int X, const int Y, const int Z)
{
   MFEM_VERIFY(N>0, "");
   RAJA::kernel<RAJA::KernelPolicy<
      RAJA::statement::CudaKernelAsync<
         RAJA::statement::For<0, RAJA::cuda_block_x_direct,
            RAJA::statement::For<1, RAJA::cuda_thread_x_direct,
               RAJA::statement::For<2, RAJA::cuda_thread_y_direct,
                  RAJA::statement::For<3, RAJA::cuda_thread_z_direct,
                     RAJA::statement::Lambda<0, Segs<0>>>>>>>>>
      (RAJA::make_tuple(RAJA::RangeSegment(0,N), RAJA::RangeSegment(0,X),
                        RAJA::RangeSegment(0,Y), RAJA::RangeSegment(0,Z)),
       [=] RAJA_DEVICE (const int k) { d_body(k); });
   MFEM_GPU_CHECK(cudaGetLastError());
}

#endif


/// RAJA OpenMP backend
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)

using RAJA::statement::Segs;

template <typename HBODY>
void RajaOmpWrap(const int N, HBODY &&h_body)
{
   RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0,N), h_body);
}

#endif


/// RAJA sequential loop backend
template <typename HBODY>
void RajaSeqWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_RAJA
   RAJA::forall<RAJA::loop_exec>(RAJA::RangeSegment(0,N), h_body);
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("RAJA requested but RAJA is not enabled!");
#endif
}


/// CUDA backend
#ifdef MFEM_USE_CUDA

template <typename BODY> __global__ static
void CuKernel1D(const int N, BODY body)
{
   const int k = blockDim.x*blockIdx.x + threadIdx.x;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void CuKernel2D(const int N, BODY body, const int BZ)
{
   const int k = blockIdx.x*BZ + threadIdx.z;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void CuKernel3D(const int N, BODY body)
{
   const int k = blockIdx.x;
   if (k >= N) { return; }
   body(k);
}

template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
void CuWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;
   CuKernel1D<<<GRID,BLCK>>>(N, d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}
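
// Worked example (illustrative numbers, assuming the default MFEM_CUDA_BLOCKS
// of 256 threads): with N = 1000000, GRID = (1000000+255)/256 = 3907 blocks;
// the surplus threads of the last block fail the 'k >= N' test in CuKernel1D.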

template <typename DBODY>
void CuWrap2D(const int N, DBODY &&d_body,
              const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   MFEM_VERIFY(BZ>0, "");
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   CuKernel2D<<<GRID,BLCK>>>(N,d_body,BZ);
   MFEM_GPU_CHECK(cudaGetLastError());
}

template <typename DBODY>
void CuWrap3D(const int N, DBODY &&d_body,
              const int X, const int Y, const int Z)
{
   if (N==0) { return; }
   const int GRID = N;
   const dim3 BLCK(X,Y,Z);
   CuKernel3D<<<GRID,BLCK>>>(N,d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}
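
// Launch-shape summary (editor's note): CuWrap2D packs BZ outer iterations
// into the z-dimension of each X*Y*BZ block, so GRID = ceil(N/BZ); CuWrap3D
// gives every outer iteration its own X*Y*Z block, so GRID = N. For example,
// N = 50 elements with X = Y = 8, Z = 4 launch 50 blocks of 256 threads each.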

#endif // MFEM_USE_CUDA


/// HIP backend
#ifdef MFEM_USE_HIP

template <typename BODY> __global__ static
void HipKernel1D(const int N, BODY body)
{
   const int k = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void HipKernel2D(const int N, BODY body, const int BZ)
{
   const int k = hipBlockIdx_x*BZ + hipThreadIdx_z;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void HipKernel3D(const int N, BODY body)
{
   const int k = hipBlockIdx_x;
   if (k >= N) { return; }
   body(k);
}

template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
void HipWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;
   hipLaunchKernelGGL(HipKernel1D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}

template <typename DBODY>
void HipWrap2D(const int N, DBODY &&d_body,
               const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   hipLaunchKernelGGL(HipKernel2D,GRID,BLCK,0,0,N,d_body,BZ);
   MFEM_GPU_CHECK(hipGetLastError());
}

template <typename DBODY>
void HipWrap3D(const int N, DBODY &&d_body,
               const int X, const int Y, const int Z)
{
   if (N==0) { return; }
   const int GRID = N;
   const dim3 BLCK(X,Y,Z);
   hipLaunchKernelGGL(HipKernel3D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}

#endif // MFEM_USE_HIP


/// The forall kernel body wrapper
template <const int DIM, typename DBODY, typename HBODY>
inline void ForallWrap(const bool use_dev, const int N,
                       DBODY &&d_body, HBODY &&h_body,
                       const int X=0, const int Y=0, const int Z=0)
{
   MFEM_CONTRACT_VAR(X);
   MFEM_CONTRACT_VAR(Y);
   MFEM_CONTRACT_VAR(Z);
   MFEM_CONTRACT_VAR(d_body);
   if (!use_dev) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
   // Handle all allowed CUDA backends except Backend::CUDA
   if (DIM == 1 && Device::Allows(Backend::CUDA_MASK & ~Backend::CUDA))
   { return RajaCudaWrap1D(N, d_body); }

   if (DIM == 2 && Device::Allows(Backend::CUDA_MASK & ~Backend::CUDA))
   { return RajaCudaWrap2D(N, d_body, X, Y, Z); }

   if (DIM == 3 && Device::Allows(Backend::CUDA_MASK & ~Backend::CUDA))
   { return RajaCudaWrap3D(N, d_body, X, Y, Z); }
#endif

#ifdef MFEM_USE_CUDA
   // Handle all allowed CUDA backends
   if (DIM == 1 && Device::Allows(Backend::CUDA_MASK))
   { return CuWrap1D(N, d_body); }

   if (DIM == 2 && Device::Allows(Backend::CUDA_MASK))
   { return CuWrap2D(N, d_body, X, Y, Z); }

   if (DIM == 3 && Device::Allows(Backend::CUDA_MASK))
   { return CuWrap3D(N, d_body, X, Y, Z); }
#endif

#ifdef MFEM_USE_HIP
   // Handle all allowed HIP backends
   if (DIM == 1 && Device::Allows(Backend::HIP_MASK))
   { return HipWrap1D(N, d_body); }

   if (DIM == 2 && Device::Allows(Backend::HIP_MASK))
   { return HipWrap2D(N, d_body, X, Y, Z); }

   if (DIM == 3 && Device::Allows(Backend::HIP_MASK))
   { return HipWrap3D(N, d_body, X, Y, Z); }
#endif

   if (Device::Allows(Backend::DEBUG)) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
   // Handle all allowed OpenMP backends except Backend::OMP
   if (Device::Allows(Backend::OMP_MASK & ~Backend::OMP))
   { return RajaOmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_OPENMP
   // Handle all allowed OpenMP backends
   if (Device::Allows(Backend::OMP_MASK)) { return OmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_RAJA
   // Handle all allowed CPU backends except Backend::CPU
   if (Device::Allows(Backend::CPU_MASK & ~Backend::CPU))
   { return RajaSeqWrap(N, h_body); }
#endif

backend_cpu:
   // Handle Backend::CPU. This is also a fallback for any allowed backends not
   // handled above, e.g. OCCA_CPU with configuration 'occa-cpu,cpu', or
   // OCCA_OMP with configuration 'occa-omp,cpu'.
   for (int k = 0; k < N; k++) { h_body(k); }
}
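
// Dispatch sketch (illustrative, not part of the original header): the branch
// taken above follows the backends enabled on the mfem::Device, e.g. in an
// application's main():
//
//    Device device("cuda");          // Backend::CUDA     -> CuWrap* paths
//    // Device device("raja-cuda");  // RAJA CUDA backend -> RajaCudaWrap* paths
//    // Device device("omp");        // OpenMP backend    -> OmpWrap
//    // Device device("cpu");        // default           -> the backend_cpu loop
//
// Every MFEM_FORALL issued afterwards is routed accordingly.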

} // namespace mfem

#endif // MFEM_FORALL_HPP