MFEM v4.5.2
forall.hpp
// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
// LICENSE and NOTICE for details. LLNL-CODE-806117.
//
// This file is part of the MFEM library. For more information and source code
// availability visit https://mfem.org.
//
// MFEM is free software; you can redistribute it and/or modify it under the
// terms of the BSD-3 license. We welcome feedback and contributions, see file
// CONTRIBUTING.md for details.

#ifndef MFEM_FORALL_HPP
#define MFEM_FORALL_HPP

#include "../config/config.hpp"
#include "annotation.hpp"
#include "error.hpp"
#include "backends.hpp"
#include "device.hpp"
#include "mem_manager.hpp"
#include "../linalg/dtensor.hpp"

namespace mfem
{

// Maximum size of dofs and quads in 1D.
#ifdef MFEM_USE_HIP
const int MAX_D1D = 10;
const int MAX_Q1D = 10;
#else
const int MAX_D1D = 14;
const int MAX_Q1D = 14;
#endif

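// Example (illustrative sketch, not part of the original header): device
// kernels that stack-allocate data sized by MAX_D1D/MAX_Q1D typically guard
// their runtime sizes against these limits. The names d1d and q1d below are
// hypothetical.
//
//    MFEM_VERIFY(d1d <= MAX_D1D, "Order is too high for this kernel.");
//    MFEM_VERIFY(q1d <= MAX_Q1D, "Quadrature rule is too fine for this kernel.");
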
// MFEM pragma macros that can be used inside MFEM_FORALL macros.
#define MFEM_PRAGMA(X) _Pragma(#X)

// MFEM_UNROLL pragma macro that can be used inside MFEM_FORALL macros.
#if defined(MFEM_USE_CUDA) && defined(__CUDA_ARCH__)
#define MFEM_UNROLL(N) MFEM_PRAGMA(unroll(N))
#else
#define MFEM_UNROLL(N)
#endif

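// Example (illustrative sketch, not part of the original header): MFEM_UNROLL
// is meant for fixed-trip-count loops inside an MFEM_FORALL body; it expands
// to #pragma unroll(N) only when compiling CUDA device code and to nothing
// otherwise. The array A below is hypothetical.
//
//    MFEM_UNROLL(4)
//    for (int q = 0; q < 4; ++q) { A[q] = 0.0; }
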
// MFEM_GPU_FORALL: "parallel for" executed with CUDA or HIP based on the MFEM
// build-time configuration (MFEM_USE_CUDA or MFEM_USE_HIP). If neither CUDA nor
// HIP is enabled, this macro is a no-op.
#if defined(MFEM_USE_CUDA)
#define MFEM_GPU_FORALL(i, N,...) CuWrap1D(N, [=] MFEM_DEVICE      \
                                           (int i) {__VA_ARGS__})
#elif defined(MFEM_USE_HIP)
#define MFEM_GPU_FORALL(i, N,...) HipWrap1D(N, [=] MFEM_DEVICE     \
                                            (int i) {__VA_ARGS__})
#else
#define MFEM_GPU_FORALL(i, N,...) do { } while (false)
#endif

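// Example (illustrative sketch, not part of the original header):
// MFEM_GPU_FORALL is intended for code paths that only run when a GPU backend
// is configured, e.g. touching raw device pointers. The pointer d_data and the
// size n are hypothetical.
//
//    MFEM_GPU_FORALL(i, n, { d_data[i] = 0.0; });
//
// On a build without CUDA or HIP the macro expands to an empty statement, so
// the loop body must not be relied upon for host-side results.
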
// Implementation of MFEM's "parallel for" (forall) device/host kernel
// interfaces supporting RAJA, CUDA, HIP, OpenMP, and sequential backends.

// The MFEM_FORALL wrapper
#define MFEM_FORALL(i,N,...)                             \
   ForallWrap<1>(true,N,                                 \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__},  \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__})

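// Example (illustrative sketch, not part of the original header): a 1D forall
// over the entries of two arrays, as used throughout the library (e.g.
// vector.cpp). The vectors x, y and the scalar a are hypothetical; Read() and
// ReadWrite() are the usual MFEM memory-manager accessors.
//
//    auto d_x = x.Read();        // const double *, valid on the device
//    auto d_y = y.ReadWrite();   // double *, valid on the device
//    MFEM_FORALL(i, y.Size(),
//    {
//       d_y[i] += a * d_x[i];
//    });
//
// The same body is captured twice, by value for the device lambda and by
// reference for the host lambda, so it must be valid in both contexts.
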
// MFEM_FORALL with a 2D CUDA block
#define MFEM_FORALL_2D(i,N,X,Y,BZ,...)                   \
   ForallWrap<2>(true,N,                                 \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__},  \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__},  \
                 X,Y,BZ)

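// Example (illustrative sketch, not part of the original header): a per-element
// kernel where each element e gets an X x Y thread block and BZ elements share
// one block in z. Inside the body, per-thread work is usually expressed with
// MFEM_FOREACH_THREAD, a companion macro from MFEM's backend headers. The names
// NE and Q1D are hypothetical.
//
//    MFEM_FORALL_2D(e, NE, Q1D, Q1D, 1,
//    {
//       MFEM_FOREACH_THREAD(qy, y, Q1D)
//       {
//          MFEM_FOREACH_THREAD(qx, x, Q1D)
//          {
//             // ... work for quadrature point (qx, qy) of element e ...
//          }
//       }
//       MFEM_SYNC_THREAD;
//    });
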
// MFEM_FORALL with a 3D CUDA block
#define MFEM_FORALL_3D(i,N,X,Y,Z,...)                    \
   ForallWrap<3>(true,N,                                 \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__},  \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__},  \
                 X,Y,Z)

// MFEM_FORALL with a 3D CUDA block and grid
// With G=0, this is the same as MFEM_FORALL_3D(i,N,X,Y,Z,...)
#define MFEM_FORALL_3D_GRID(i,N,X,Y,Z,G,...)             \
   ForallWrap<3>(true,N,                                 \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__},  \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__},  \
                 X,Y,Z,G)

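// Example (illustrative sketch, not part of the original header): the _GRID
// variant caps the launch grid at G blocks, and the underlying 3D kernel then
// strides over elements (k += gridDim.x), so one block may process several
// elements. With G=0 one block per element is launched. The names NE and Q1D
// are hypothetical.
//
//    MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D, { /* one block per element e */ });
//    MFEM_FORALL_3D_GRID(e, NE, Q1D, Q1D, Q1D, 80*8, { /* at most 640 blocks */ });
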
// MFEM_FORALL that uses the basic CPU backend when use_dev is false. See for
// example the functions in vector.cpp, where we don't want to use the mfem
// device for operations on small vectors.
#define MFEM_FORALL_SWITCH(use_dev,i,N,...)              \
   ForallWrap<1>(use_dev,N,                              \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__},  \
                 [&] MFEM_LAMBDA (int i) {__VA_ARGS__})

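// Example (illustrative sketch, not part of the original header): the switch
// form lets the caller decide at runtime whether the device is worth using,
// typically via Vector::UseDevice(). The vector v and the scalar c are
// hypothetical.
//
//    auto d_v = v.ReadWrite(v.UseDevice());
//    MFEM_FORALL_SWITCH(v.UseDevice(), i, v.Size(), { d_v[i] *= c; });
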

/// OpenMP backend
template <typename HBODY>
void OmpWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_OPENMP
   #pragma omp parallel for
   for (int k = 0; k < N; k++)
   {
      h_body(k);
   }
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("OpenMP requested for MFEM but OpenMP is not enabled!");
#endif
}


/// RAJA Cuda and Hip backends
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
using cuda_launch_policy =
   RAJA::LaunchPolicy<RAJA::cuda_launch_t<true>>;
using cuda_teams_x =
   RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
using cuda_threads_z =
   RAJA::LoopPolicy<RAJA::cuda_thread_z_direct>;
#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
using hip_launch_policy =
   RAJA::LaunchPolicy<RAJA::hip_launch_t<true>>;
using hip_teams_x =
   RAJA::LoopPolicy<RAJA::hip_block_x_direct>;
using hip_threads_z =
   RAJA::LoopPolicy<RAJA::hip_thread_z_direct>;
#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
template <const int BLOCKS = MFEM_CUDA_BLOCKS, typename DBODY>
void RajaCuWrap1D(const int N, DBODY &&d_body)
{
   // true denotes asynchronous kernel
   RAJA::forall<RAJA::cuda_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
}

template <typename DBODY>
void RajaCuWrap2D(const int N, DBODY &&d_body,
                  const int X, const int Y, const int BZ)
{
   MFEM_VERIFY(N>0, "");
   MFEM_VERIFY(BZ>0, "");
   const int G = (N+BZ-1)/BZ;

   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<cuda_launch_policy>
   (LaunchParams(Teams(G), Threads(X, Y, BZ)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<cuda_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
      {
         loop<cuda_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
         {
            const int k = n*BZ + tz;
            if (k >= N) { return; }
            d_body(k);
         });
      });
   });

   MFEM_GPU_CHECK(cudaGetLastError());
}

template <typename DBODY>
void RajaCuWrap3D(const int N, DBODY &&d_body,
                  const int X, const int Y, const int Z, const int G)
{
   MFEM_VERIFY(N>0, "");
   const int GRID = G == 0 ? N : G;
   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<cuda_launch_policy>
   (LaunchParams(Teams(GRID), Threads(X, Y, Z)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<cuda_teams_x>(ctx, RangeSegment(0, N), d_body);
   });

   MFEM_GPU_CHECK(cudaGetLastError());
}

template <int Dim>
struct RajaCuWrap;

template <>
struct RajaCuWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaCuWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct RajaCuWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaCuWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct RajaCuWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaCuWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
template <const int BLOCKS = MFEM_HIP_BLOCKS, typename DBODY>
void RajaHipWrap1D(const int N, DBODY &&d_body)
{
   // true denotes asynchronous kernel
   RAJA::forall<RAJA::hip_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
}

template <typename DBODY>
void RajaHipWrap2D(const int N, DBODY &&d_body,
                   const int X, const int Y, const int BZ)
{
   MFEM_VERIFY(N>0, "");
   MFEM_VERIFY(BZ>0, "");
   const int G = (N+BZ-1)/BZ;

   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<hip_launch_policy>
   (LaunchParams(Teams(G), Threads(X, Y, BZ)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<hip_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
      {
         loop<hip_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
         {
            const int k = n*BZ + tz;
            if (k >= N) { return; }
            d_body(k);
         });
      });
   });

   MFEM_GPU_CHECK(hipGetLastError());
}

template <typename DBODY>
void RajaHipWrap3D(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
{
   MFEM_VERIFY(N>0, "");
   const int GRID = G == 0 ? N : G;
   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<hip_launch_policy>
   (LaunchParams(Teams(GRID), Threads(X, Y, Z)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<hip_teams_x>(ctx, RangeSegment(0, N), d_body);
   });

   MFEM_GPU_CHECK(hipGetLastError());
}

template <int Dim>
struct RajaHipWrap;

template <>
struct RajaHipWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaHipWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct RajaHipWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaHipWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct RajaHipWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaHipWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif

/// RAJA OpenMP backend
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)

template <typename HBODY>
void RajaOmpWrap(const int N, HBODY &&h_body)
{
   RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0,N), h_body);
}

#endif


/// RAJA sequential loop backend
template <typename HBODY>
void RajaSeqWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_RAJA
   RAJA::forall<RAJA::loop_exec>(RAJA::RangeSegment(0,N), h_body);
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("RAJA requested but RAJA is not enabled!");
#endif
}


/// CUDA backend
#ifdef MFEM_USE_CUDA

template <typename BODY> __global__ static
void CuKernel1D(const int N, BODY body)
{
   const int k = blockDim.x*blockIdx.x + threadIdx.x;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void CuKernel2D(const int N, BODY body)
{
   const int k = blockIdx.x*blockDim.z + threadIdx.z;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void CuKernel3D(const int N, BODY body)
{
   for (int k = blockIdx.x; k < N; k += gridDim.x) { body(k); }
}

template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
void CuWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;
   CuKernel1D<<<GRID,BLCK>>>(N, d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}

template <typename DBODY>
void CuWrap2D(const int N, DBODY &&d_body,
              const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   MFEM_VERIFY(BZ>0, "");
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   CuKernel2D<<<GRID,BLCK>>>(N,d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}

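// Worked example (illustrative, not part of the original header): for N = 10
// elements with BZ = 4 elements per block, CuWrap2D launches GRID = (10+4-1)/4
// = 3 blocks of shape (X, Y, 4). In CuKernel2D each z-slice of a block handles
// one element, k = blockIdx.x*blockDim.z + threadIdx.z, so the last two slices
// (k = 10, 11) are masked by the k >= N check.
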
template <typename DBODY>
void CuWrap3D(const int N, DBODY &&d_body,
              const int X, const int Y, const int Z, const int G)
{
   if (N==0) { return; }
   const int GRID = G == 0 ? N : G;
   const dim3 BLCK(X,Y,Z);
   CuKernel3D<<<GRID,BLCK>>>(N,d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}

template <int Dim>
struct CuWrap;

template <>
struct CuWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      CuWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct CuWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      CuWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct CuWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      CuWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif // MFEM_USE_CUDA


/// HIP backend
#ifdef MFEM_USE_HIP

template <typename BODY> __global__ static
void HipKernel1D(const int N, BODY body)
{
   const int k = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void HipKernel2D(const int N, BODY body)
{
   const int k = hipBlockIdx_x*hipBlockDim_z + hipThreadIdx_z;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void HipKernel3D(const int N, BODY body)
{
   for (int k = hipBlockIdx_x; k < N; k += hipGridDim_x) { body(k); }
}

template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
void HipWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;
   hipLaunchKernelGGL(HipKernel1D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}

template <typename DBODY>
void HipWrap2D(const int N, DBODY &&d_body,
               const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   hipLaunchKernelGGL(HipKernel2D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}

template <typename DBODY>
void HipWrap3D(const int N, DBODY &&d_body,
               const int X, const int Y, const int Z, const int G)
{
   if (N==0) { return; }
   const int GRID = G == 0 ? N : G;
   const dim3 BLCK(X,Y,Z);
   hipLaunchKernelGGL(HipKernel3D,GRID,BLCK,0,0,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}

template <int Dim>
struct HipWrap;

template <>
struct HipWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      HipWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct HipWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      HipWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct HipWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      HipWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif // MFEM_USE_HIP


/// The forall kernel body wrapper
template <const int DIM, typename DBODY, typename HBODY>
inline void ForallWrap(const bool use_dev, const int N,
                       DBODY &&d_body, HBODY &&h_body,
                       const int X=0, const int Y=0, const int Z=0,
                       const int G=0)
{
   MFEM_CONTRACT_VAR(X);
   MFEM_CONTRACT_VAR(Y);
   MFEM_CONTRACT_VAR(Z);
   MFEM_CONTRACT_VAR(G);
   MFEM_CONTRACT_VAR(d_body);
   if (!use_dev) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
   // If Backend::RAJA_CUDA is allowed, use it
   if (Device::Allows(Backend::RAJA_CUDA))
   {
      return RajaCuWrap<DIM>::run(N, d_body, X, Y, Z, G);
   }
#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
   // If Backend::RAJA_HIP is allowed, use it
   if (Device::Allows(Backend::RAJA_HIP))
   {
      return RajaHipWrap<DIM>::run(N, d_body, X, Y, Z, G);
   }
#endif

#ifdef MFEM_USE_CUDA
   // If Backend::CUDA is allowed, use it
   if (Device::Allows(Backend::CUDA))
   {
      return CuWrap<DIM>::run(N, d_body, X, Y, Z, G);
   }
#endif

#ifdef MFEM_USE_HIP
   // If Backend::HIP is allowed, use it
   if (Device::Allows(Backend::HIP))
   {
      return HipWrap<DIM>::run(N, d_body, X, Y, Z, G);
   }
#endif

   // If Backend::DEBUG_DEVICE is allowed, use it
   if (Device::Allows(Backend::DEBUG_DEVICE)) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
   // If Backend::RAJA_OMP is allowed, use it
   if (Device::Allows(Backend::RAJA_OMP)) { return RajaOmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_OPENMP
   // If Backend::OMP is allowed, use it
   if (Device::Allows(Backend::OMP)) { return OmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_RAJA
   // If Backend::RAJA_CPU is allowed, use it
   if (Device::Allows(Backend::RAJA_CPU)) { return RajaSeqWrap(N, h_body); }
#endif

backend_cpu:
   // Handle Backend::CPU. This is also a fallback for any allowed backends not
   // handled above, e.g. OCCA_CPU with configuration 'occa-cpu,cpu', or
   // OCCA_OMP with configuration 'occa-omp,cpu'.
   for (int k = 0; k < N; k++) { h_body(k); }
}

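// Usage sketch (illustrative, not part of the original header): MFEM_FORALL and
// its variants expand to ForallWrap calls, so the dispatch above decides where
// a kernel runs for the current Device configuration. A direct call is
// equivalent; the pointer d_a and the size n are hypothetical.
//
//    ForallWrap<1>(Device::IsEnabled(), n,
//                  [=] MFEM_DEVICE (int i) { d_a[i] = 0.0; },
//                  [&] MFEM_LAMBDA (int i) { d_a[i] = 0.0; });
//
// The device lambda is tried first (RAJA_CUDA, RAJA_HIP, CUDA, HIP, in that
// order); otherwise the host backends (RAJA_OMP, OMP, RAJA_CPU) or the plain
// CPU loop at the end handle the iteration.
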
} // namespace mfem

#endif // MFEM_FORALL_HPP