MFEM v4.9.0
Finite element discretization library
forall.hpp
1// Copyright (c) 2010-2025, Lawrence Livermore National Security, LLC. Produced
2// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
3// LICENSE and NOTICE for details. LLNL-CODE-806117.
4//
5// This file is part of the MFEM library. For more information and source code
6// availability visit https://mfem.org.
7//
8// MFEM is free software; you can redistribute it and/or modify it under the
9// terms of the BSD-3 license. We welcome feedback and contributions, see file
10// CONTRIBUTING.md for details.
11
12#ifndef MFEM_FORALL_HPP
13#define MFEM_FORALL_HPP
14
15#include "../config/config.hpp"
16#include "annotation.hpp"
17#include "error.hpp"
18#include "backends.hpp"
19#include "device.hpp"
20#include "mem_manager.hpp"
21#include "../linalg/dtensor.hpp"
22#ifdef MFEM_USE_MPI
23#include <_hypre_utilities.h>
24#endif
25
26namespace mfem
27{
28
29// The following DofQuadLimits_ structs define the maximum values of D1D and Q1D
30// often used in the "fallback kernels" for partial assembly. Different limits
31// take effect for different architectures. The limits should be queried using
32// the public interface in DeviceDofQuadLimits or DofQuadLimits, and generally
33// not by directly accessing the structs defined below.
34//
35// In host code, the limits associated with the currently configured Device can
36// be accessed using DeviceDofQuadLimits::Get().
37//
38// In mfem::forall kernels or MFEM_HOST_DEVICE functions, the limits
39// corresponding to the architecture the function is being compiled for can be
40// accessed as static constexpr variables using the type alias DofQuadLimits.
41
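// A minimal usage sketch of the two query paths (the variables d1d, q1d, N,
// and the kernel body below are hypothetical, for illustration only):
//
//    // Host code: limits for the Device configured at runtime.
//    const DeviceDofQuadLimits &lim = DeviceDofQuadLimits::Get();
//    MFEM_VERIFY(d1d <= lim.MAX_D1D && q1d <= lim.MAX_Q1D,
//                "order is not supported by the fallback kernels");
//
//    // Kernel code: compile-time limits for the current compilation pass.
//    mfem::forall(N, [=] MFEM_HOST_DEVICE (int i)
//    {
//       constexpr int MD1 = DofQuadLimits::MAX_D1D;
//       double dofs[MD1]; // local buffer sized by the compile-time limit
//       for (int d = 0; d < MD1; d++) { dofs[d] = 0.0; }
//    });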
42namespace internal
43{
44
45struct DofQuadLimits_CUDA
46{
47 static constexpr int MAX_D1D = 14;
48 static constexpr int MAX_Q1D = 14;
49 static constexpr int MAX_T1D = 32;
50 static constexpr int HCURL_MAX_D1D = 5;
51 static constexpr int HCURL_MAX_Q1D = 6;
52 static constexpr int HDIV_MAX_D1D = 5;
53 static constexpr int HDIV_MAX_Q1D = 6;
54 static constexpr int MAX_INTERP_1D = 8;
55 static constexpr int MAX_DET_1D = 6;
56};
57
58struct DofQuadLimits_HIP
59{
60 static constexpr int MAX_D1D = 10;
61 static constexpr int MAX_Q1D = 10;
62 static constexpr int MAX_T1D = 32;
63 static constexpr int HCURL_MAX_D1D = 5;
64 static constexpr int HCURL_MAX_Q1D = 5;
65 static constexpr int HDIV_MAX_D1D = 5;
66 static constexpr int HDIV_MAX_Q1D = 6;
67 static constexpr int MAX_INTERP_1D = 8;
68 static constexpr int MAX_DET_1D = 6;
69};
70
71struct DofQuadLimits_CPU
72{
73#ifndef _WIN32
74 static constexpr int MAX_D1D = 24;
75 static constexpr int MAX_Q1D = 24;
76#else
77 static constexpr int MAX_D1D = 14;
78 static constexpr int MAX_Q1D = 14;
79#endif
80 static constexpr int MAX_T1D = 32;
81 static constexpr int HCURL_MAX_D1D = 10;
82 static constexpr int HCURL_MAX_Q1D = 10;
83 static constexpr int HDIV_MAX_D1D = 10;
84 static constexpr int HDIV_MAX_Q1D = 10;
85 static constexpr int MAX_INTERP_1D = MAX_D1D;
86 static constexpr int MAX_DET_1D = MAX_D1D;
87};
88
89} // namespace internal
90
91/// @brief Maximum number of 1D DOFs or quadrature points for the architecture
92/// currently being compiled for (used in fallback kernels).
93///
94/// DofQuadLimits provides access to the limits as static constexpr member
95/// variables for use in mfem::forall kernels or MFEM_HOST_DEVICE functions.
96///
97/// @sa For accessing the limits according to the runtime configuration of the
98/// Device, see DeviceDofQuadLimits.
99#if defined(__CUDA_ARCH__)
100using DofQuadLimits = internal::DofQuadLimits_CUDA;
101#elif defined(__HIP_DEVICE_COMPILE__)
102using DofQuadLimits = internal::DofQuadLimits_HIP;
103#else
104using DofQuadLimits = internal::DofQuadLimits_CPU;
105#endif
106
107/// @brief Maximum number of 1D DOFs or quadrature points for the current
108/// runtime configuration of the Device (used in fallback kernels).
109///
110/// DeviceDofQuadLimits can be used in host code to query the limits for the
111/// configured device (e.g. when the user has selected GPU execution at
112/// runtime).
113///
114/// @sa For accessing the limits according to the current compiler pass, see
115/// DofQuadLimits.
116struct DeviceDofQuadLimits
117{
118 int MAX_D1D; ///< Maximum number of 1D nodal points.
119 int MAX_Q1D; ///< Maximum number of 1D quadrature points.
120 int HCURL_MAX_D1D; ///< Maximum number of 1D nodal points for H(curl).
121 int HCURL_MAX_Q1D; ///< Maximum number of 1D quadrature points for H(curl).
122 int HDIV_MAX_D1D; ///< Maximum number of 1D nodal points for H(div).
123 int HDIV_MAX_Q1D; ///< Maximum number of 1D quadrature points for H(div).
124 int MAX_INTERP_1D; ///< Maximum number of points for use in QuadratureInterpolator.
125 int MAX_DET_1D; ///< Maximum number of points for determinant computation in QuadratureInterpolator.
126
127 /// Return a const reference to the DeviceDofQuadLimits singleton.
128 static const DeviceDofQuadLimits &Get()
129 {
130 static const DeviceDofQuadLimits dof_quad_limits;
131 return dof_quad_limits;
132 }
133
134private:
135 /// Initialize the limits depending on the configuration of the Device.
136 DeviceDofQuadLimits()
137 {
138 if (Device::Allows(Backend::CUDA_MASK)) { Populate<internal::DofQuadLimits_CUDA>(); }
139 else if (Device::Allows(Backend::HIP_MASK)) { Populate<internal::DofQuadLimits_HIP>(); }
140 else { Populate<internal::DofQuadLimits_CPU>(); }
141 }
142
143 /// @brief Set the limits using the static members of the type @a T.
144 ///
145 /// @a T should be one of DofQuadLimits_CUDA, DofQuadLimits_HIP, or
146 /// DofQuadLimits_CPU.
147 template <typename T> void Populate()
148 {
149 MAX_D1D = T::MAX_D1D;
150 MAX_Q1D = T::MAX_Q1D;
151 HCURL_MAX_D1D = T::HCURL_MAX_D1D;
152 HCURL_MAX_Q1D = T::HCURL_MAX_Q1D;
153 HDIV_MAX_D1D = T::HDIV_MAX_D1D;
154 HDIV_MAX_Q1D = T::HDIV_MAX_Q1D;
155 MAX_INTERP_1D = T::MAX_INTERP_1D;
156 MAX_DET_1D = T::MAX_DET_1D;
157 }
158};
159
160// MFEM pragma macros that can be used inside MFEM_FORALL macros.
161#define MFEM_PRAGMA(X) _Pragma(#X)
162
163// MFEM_UNROLL pragma macro that can be used inside MFEM_FORALL macros.
164#if defined(MFEM_USE_CUDA) && defined(__CUDA_ARCH__) // Clang cuda or nvcc
165#ifdef __NVCC__ // nvcc specifically
166#define MFEM_UNROLL(N) MFEM_PRAGMA(unroll(N))
167#else // Assuming Clang CUDA
168#define MFEM_UNROLL(N) MFEM_PRAGMA(unroll N)
169#endif
170#else
171#define MFEM_UNROLL(N)
172#endif
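
// A sketch of MFEM_UNROLL inside a kernel body (x, y, and N are hypothetical
// device arrays/sizes): under nvcc or Clang CUDA it emits an unroll pragma,
// and on all other backends it expands to nothing, so the code stays valid:
//
//    MFEM_FORALL(i, N,
//    {
//       MFEM_UNROLL(4)
//       for (int c = 0; c < 4; c++) { y[4*i + c] = 2.0 * x[4*i + c]; }
//    });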
173
174// MFEM_GPU_FORALL: "parallel for" executed with CUDA or HIP based on the MFEM
175// build-time configuration (MFEM_USE_CUDA or MFEM_USE_HIP), and only when
176// compiling with the CUDA/HIP compiler. Otherwise, this macro is a no-op.
177#if defined(MFEM_USE_CUDA) && defined(__CUDACC__)
178#define MFEM_GPU_FORALL(i, N,...) CuWrap1D(N, [=] MFEM_DEVICE \
179 (int i) {__VA_ARGS__})
180#elif defined(MFEM_USE_HIP) && defined(__HIP__)
181#define MFEM_GPU_FORALL(i, N,...) HipWrap1D(N, [=] MFEM_DEVICE \
182 (int i) {__VA_ARGS__})
183#else
184#define MFEM_GPU_FORALL(i, N,...) do { } while (false)
185#endif
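
// A sketch of MFEM_GPU_FORALL with a hypothetical device pointer `data`; the
// kernel is launched only when this translation unit is compiled as CUDA/HIP,
// otherwise the whole statement is an empty do/while:
//
//    MFEM_GPU_FORALL(i, N, { data[i] = 0.0; });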
186
187// Implementation of MFEM's "parallel for" (forall) device/host kernel
188// interfaces supporting RAJA, CUDA, OpenMP, and sequential backends.
189
190// The MFEM_FORALL wrapper
191#define MFEM_FORALL(i,N,...) \
192 ForallWrap<1>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__})
193
194// MFEM_FORALL with a 2D CUDA block
195#define MFEM_FORALL_2D(i,N,X,Y,BZ,...) \
196 ForallWrap<2>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,BZ)
197
198// MFEM_FORALL with a 3D CUDA block
199#define MFEM_FORALL_3D(i,N,X,Y,Z,...) \
200 ForallWrap<3>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,Z)
201
202// MFEM_FORALL with a 3D CUDA block and grid
203// With G=0, this is the same as MFEM_FORALL_3D(i,N,X,Y,Z,...)
204#define MFEM_FORALL_3D_GRID(i,N,X,Y,Z,G,...) \
205 ForallWrap<3>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,Z,G)
206
207// MFEM_FORALL that uses the basic CPU backend when use_dev is false. See for
208// example the functions in vector.cpp, where we don't want to use the mfem
209// device for operations on small vectors.
210#define MFEM_FORALL_SWITCH(use_dev,i,N,...) \
211 ForallWrap<1>(use_dev,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__})
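
// A sketch of the macros in action, with hypothetical arrays x, y and scalar a:
//
//    MFEM_FORALL(i, N, { y[i] += a * x[i]; });                  // device if enabled
//    MFEM_FORALL_SWITCH(N > 1024, i, N, { y[i] += a * x[i]; }); // host for small N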
212
213
214/// OpenMP backend
215template <typename HBODY>
216void OmpWrap(const int N, HBODY &&h_body)
217{
218#ifdef MFEM_USE_OPENMP
219 #pragma omp parallel for
220 for (int k = 0; k < N; k++)
221 {
222 h_body(k);
223 }
224#else
225 MFEM_CONTRACT_VAR(N);
226 MFEM_CONTRACT_VAR(h_body);
227 MFEM_ABORT("OpenMP requested for MFEM but OpenMP is not enabled!");
228#endif
229}
230
231template <typename HBODY>
232void OmpWrap2D(const int Nx, const int Ny, HBODY &&h_body)
233{
234#ifdef MFEM_USE_OPENMP
235 // requires OpenMP 3.1
236 #pragma omp parallel for collapse(2)
237 for (int j = 0; j < Ny; j++)
238 {
239 for (int i = 0; i < Nx; i++)
240 {
241 h_body(i, j);
242 }
243 }
244#else
245 MFEM_CONTRACT_VAR(Nx);
246 MFEM_CONTRACT_VAR(Ny);
247 MFEM_CONTRACT_VAR(h_body);
248 MFEM_ABORT("OpenMP requested for MFEM but OpenMP is not enabled!");
249#endif
250}
251
252template <typename HBODY>
253void OmpWrap3D(const int Nx, const int Ny, const int Nz, HBODY &&h_body)
254{
255#ifdef MFEM_USE_OPENMP
256 // requires OpenMP 3.1
257 #pragma omp parallel for collapse(3)
258 for (int k = 0; k < Nz; k++)
259 {
260 for (int j = 0; j < Ny; j++)
261 {
262 for (int i = 0; i < Nx; i++)
263 {
264 h_body(i, j, k);
265 }
266 }
267 }
268#else
269 MFEM_CONTRACT_VAR(Nx);
270 MFEM_CONTRACT_VAR(Ny);
271 MFEM_CONTRACT_VAR(Nz);
272 MFEM_CONTRACT_VAR(h_body);
273 MFEM_ABORT("OpenMP requested for MFEM but OpenMP is not enabled!");
274#endif
275}
276
277
278/// RAJA Cuda and Hip backends
279#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA) && defined(__CUDACC__)
280using cuda_launch_policy =
281 RAJA::LaunchPolicy<RAJA::cuda_launch_t<true>>;
282using cuda_teams_x =
283 RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
284using cuda_threads_z =
285 RAJA::LoopPolicy<RAJA::cuda_thread_z_direct>;
286#endif
287
288#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP) && defined(__HIP__)
289using hip_launch_policy =
290 RAJA::LaunchPolicy<RAJA::hip_launch_t<true>>;
291using hip_teams_x =
292 RAJA::LoopPolicy<RAJA::hip_block_x_direct>;
293using hip_threads_z =
294 RAJA::LoopPolicy<RAJA::hip_thread_z_direct>;
295#endif
296
297#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA) && defined(__CUDACC__)
298template <const int BLOCKS = MFEM_CUDA_BLOCKS, typename DBODY>
299void RajaCuWrap1D(const int N, DBODY &&d_body)
300{
301 // the 'true' template argument launches the kernel asynchronously
302 RAJA::forall<RAJA::cuda_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
303}
304
305template <typename DBODY>
306void RajaCuWrap2D(const int N, DBODY &&d_body,
307 const int X, const int Y, const int BZ)
308{
309 MFEM_VERIFY(BZ>0, "");
310 const int G = (N+BZ-1)/BZ;
311
312 using namespace RAJA;
313 using RAJA::RangeSegment;
314
315 launch<cuda_launch_policy>
316 (LaunchParams(Teams(G), Threads(X, Y, BZ)),
317 [=] RAJA_DEVICE (LaunchContext ctx)
318 {
319
320 loop<cuda_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
321 {
322
323 loop<cuda_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
324 {
325
326 const int k = n*BZ + tz;
327 if (k >= N) { return; }
328 d_body(k);
329
330 });
331
332 });
333
334 });
335
336 MFEM_GPU_CHECK(cudaGetLastError());
337}
338
339template <typename DBODY>
340void RajaCuWrap3D(const int N, DBODY &&d_body,
341 const int X, const int Y, const int Z, const int G)
342{
343 const int GRID = G == 0 ? N : G;
344 using namespace RAJA;
345 using RAJA::RangeSegment;
346
347 launch<cuda_launch_policy>
348 (LaunchParams(Teams(GRID), Threads(X, Y, Z)),
349 [=] RAJA_DEVICE (LaunchContext ctx)
350 {
351
352 loop<cuda_teams_x>(ctx, RangeSegment(0, N), d_body);
353
354 });
355
356 MFEM_GPU_CHECK(cudaGetLastError());
357}
358
359template <int Dim>
360struct RajaCuWrap;
361
362template <>
363struct RajaCuWrap<1>
364{
365 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
366 static void run(const int N, DBODY &&d_body,
367 const int X, const int Y, const int Z, const int G)
368 {
369 RajaCuWrap1D<BLCK>(N, d_body);
370 }
371};
372
373template <>
374struct RajaCuWrap<2>
375{
376 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
377 static void run(const int N, DBODY &&d_body,
378 const int X, const int Y, const int Z, const int G)
379 {
380 RajaCuWrap2D(N, d_body, X, Y, Z);
381 }
382};
383
384template <>
385struct RajaCuWrap<3>
386{
387 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
388 static void run(const int N, DBODY &&d_body,
389 const int X, const int Y, const int Z, const int G)
390 {
391 RajaCuWrap3D(N, d_body, X, Y, Z, G);
392 }
393};
394
395#endif
396
397#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP) && defined(__HIP__)
398template <const int BLOCKS = MFEM_HIP_BLOCKS, typename DBODY>
399void RajaHipWrap1D(const int N, DBODY &&d_body)
400{
401 // the 'true' template argument launches the kernel asynchronously
402 RAJA::forall<RAJA::hip_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
403}
404
405template <typename DBODY>
406void RajaHipWrap2D(const int N, DBODY &&d_body,
407 const int X, const int Y, const int BZ)
408{
409 MFEM_VERIFY(BZ>0, "");
410 const int G = (N+BZ-1)/BZ;
411
412 using namespace RAJA;
413 using RAJA::RangeSegment;
414
415 launch<hip_launch_policy>
416 (LaunchParams(Teams(G), Threads(X, Y, BZ)),
417 [=] RAJA_DEVICE (LaunchContext ctx)
418 {
419
420 loop<hip_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
421 {
422
423 loop<hip_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
424 {
425
426 const int k = n*BZ + tz;
427 if (k >= N) { return; }
428 d_body(k);
429
430 });
431
432 });
433
434 });
435
436 MFEM_GPU_CHECK(hipGetLastError());
437}
438
439template <typename DBODY>
440void RajaHipWrap3D(const int N, DBODY &&d_body,
441 const int X, const int Y, const int Z, const int G)
442{
443 const int GRID = G == 0 ? N : G;
444 using namespace RAJA;
445 using RAJA::RangeSegment;
446
447 launch<hip_launch_policy>
448 (LaunchParams(Teams(GRID), Threads(X, Y, Z)),
449 [=] RAJA_DEVICE (LaunchContext ctx)
450 {
451
452 loop<hip_teams_x>(ctx, RangeSegment(0, N), d_body);
453
454 });
455
456 MFEM_GPU_CHECK(hipGetLastError());
457}
458
459template <int Dim>
460struct RajaHipWrap;
461
462template <>
463struct RajaHipWrap<1>
464{
465 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
466 static void run(const int N, DBODY &&d_body,
467 const int X, const int Y, const int Z, const int G)
468 {
469 RajaHipWrap1D<BLCK>(N, d_body);
470 }
471};
472
473template <>
474struct RajaHipWrap<2>
475{
476 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
477 static void run(const int N, DBODY &&d_body,
478 const int X, const int Y, const int Z, const int G)
479 {
480 RajaHipWrap2D(N, d_body, X, Y, Z);
481 }
482};
483
484template <>
485struct RajaHipWrap<3>
486{
487 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
488 static void run(const int N, DBODY &&d_body,
489 const int X, const int Y, const int Z, const int G)
490 {
491 RajaHipWrap3D(N, d_body, X, Y, Z, G);
492 }
493};
494
495#endif
496
497/// RAJA OpenMP backend
498#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
499
500template <typename HBODY>
501void RajaOmpWrap(const int N, HBODY &&h_body)
502{
503 RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0,N), h_body);
504}
505
506template <typename HBODY>
507void RajaOmpWrap2D(const int Nx, const int Ny, HBODY &&h_body)
508{
509 using omp_launch_policy = RAJA::LaunchPolicy<RAJA::omp_launch_t>;
510 using global_thread_xy = RAJA::LoopPolicy<RAJA::omp_for_exec>;
511 RAJA::RangeSegment xrange(0, Nx);
512 RAJA::RangeSegment yrange(0, Ny);
513 RAJA::launch<omp_launch_policy>(RAJA::ExecPlace::HOST, RAJA::LaunchParams(),
514 [=](RAJA::LaunchContext ctx)
515 {
516 // contiguous in x
517 RAJA::expt::loop<global_thread_xy>(ctx, xrange, yrange, [&](int i, int j)
518 {
519 h_body(i, j);
520 });
521 });
522}
523
524template <typename HBODY>
525void RajaOmpWrap3D(const int Nx, const int Ny, const int Nz, HBODY &&h_body)
526{
527 using omp_launch_policy = RAJA::LaunchPolicy<RAJA::omp_launch_t>;
528 using global_thread_xyz = RAJA::LoopPolicy<RAJA::omp_for_exec>;
529 RAJA::RangeSegment xrange(0, Nx);
530 RAJA::RangeSegment yrange(0, Ny);
531 RAJA::RangeSegment zrange(0, Nz);
532 RAJA::launch<omp_launch_policy>(RAJA::ExecPlace::HOST, RAJA::LaunchParams(),
533 [=](RAJA::LaunchContext ctx)
534 {
535 // contiguous in x
536 RAJA::expt::loop<global_thread_xyz>(ctx, xrange, yrange, zrange,
537 [&](int i, int j, int k)
538 { h_body(i, j, k); });
539 });
540}
541
542#endif
543
544
545/// RAJA sequential loop backend
546template <typename HBODY>
547void RajaSeqWrap(const int N, HBODY &&h_body)
548{
549#ifdef MFEM_USE_RAJA
550
551#if (RAJA_VERSION_MAJOR >= 2023)
552 // loop_exec was marked deprecated in RAJA version 2023.06.0 and will be
553 // removed. We now use seq_exec.
554 using raja_forall_pol = RAJA::seq_exec;
555#else
556 using raja_forall_pol = RAJA::loop_exec;
557#endif
558
559 RAJA::forall<raja_forall_pol>(RAJA::RangeSegment(0,N), h_body);
560#else
561 MFEM_CONTRACT_VAR(N);
562 MFEM_CONTRACT_VAR(h_body);
563 MFEM_ABORT("RAJA requested but RAJA is not enabled!");
564#endif
565}
566
567
568/// CUDA backend
569#if defined(MFEM_USE_CUDA) && defined(__CUDACC__)
570
571template <typename BODY> __global__ static
572void CuKernel1D(const int N, BODY body)
573{
574 const int k = blockDim.x*blockIdx.x + threadIdx.x;
575 if (k >= N) { return; }
576 body(k);
577}
578
579template <typename BODY> __global__ static
580void CuKernel2D(const int N, BODY body)
581{
582 const int k = blockIdx.x*blockDim.z + threadIdx.z;
583 if (k >= N) { return; }
584 body(k);
585}
586
587template <typename BODY> __global__ static
588void CuKernel3D(const int N, BODY body)
589{
590 for (int k = blockIdx.x; k < N; k += gridDim.x) { body(k); }
591}
592
593template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
594void CuWrap1D(const int N, DBODY &&d_body)
595{
596 if (N==0) { return; }
597 const int GRID = (N+BLCK-1)/BLCK;
598 CuKernel1D<<<GRID,BLCK>>>(N, d_body);
599 MFEM_GPU_CHECK(cudaGetLastError());
600}
601
602template <typename DBODY>
603void CuWrap2D(const int N, DBODY &&d_body,
604 const int X, const int Y, const int BZ)
605{
606 if (N==0) { return; }
607 MFEM_VERIFY(BZ>0, "");
608 const int GRID = (N+BZ-1)/BZ;
609 const dim3 BLCK(X,Y,BZ);
610 CuKernel2D<<<GRID,BLCK>>>(N,d_body);
611 MFEM_GPU_CHECK(cudaGetLastError());
612}
613
614template <typename DBODY>
615void CuWrap3D(const int N, DBODY &&d_body,
616 const int X, const int Y, const int Z, const int G)
617{
618 if (N==0) { return; }
619 const int GRID = G == 0 ? N : G;
620 const dim3 BLCK(X,Y,Z);
621 CuKernel3D<<<GRID,BLCK>>>(N,d_body);
622 MFEM_GPU_CHECK(cudaGetLastError());
623}
624
625template <int Dim>
626struct CuWrap;
627
628template <>
629struct CuWrap<1>
630{
631 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
632 static void run(const int N, DBODY &&d_body,
633 const int X, const int Y, const int Z, const int G)
634 {
635 CuWrap1D<BLCK>(N, d_body);
636 }
637};
638
639template <>
640struct CuWrap<2>
641{
642 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
643 static void run(const int N, DBODY &&d_body,
644 const int X, const int Y, const int Z, const int G)
645 {
646 CuWrap2D(N, d_body, X, Y, Z);
647 }
648};
649
650template <>
651struct CuWrap<3>
652{
653 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
654 static void run(const int N, DBODY &&d_body,
655 const int X, const int Y, const int Z, const int G)
656 {
657 CuWrap3D(N, d_body, X, Y, Z, G);
658 }
659};
660
661#endif // defined(MFEM_USE_CUDA) && defined(__CUDACC__)
662
663
664/// HIP backend
665#if defined(MFEM_USE_HIP) && defined(__HIP__)
666
667template <typename BODY> __global__ static
668void HipKernel1D(const int N, BODY body)
669{
670 const int k = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
671 if (k >= N) { return; }
672 body(k);
673}
674
675template <typename BODY> __global__ static
676void HipKernel2D(const int N, BODY body)
677{
678 const int k = hipBlockIdx_x*hipBlockDim_z + hipThreadIdx_z;
679 if (k >= N) { return; }
680 body(k);
681}
682
683template <typename BODY> __global__ static
684void HipKernel3D(const int N, BODY body)
685{
686 for (int k = hipBlockIdx_x; k < N; k += hipGridDim_x) { body(k); }
687}
688
689template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
690void HipWrap1D(const int N, DBODY &&d_body)
691{
692 if (N==0) { return; }
693 const int GRID = (N+BLCK-1)/BLCK;
694 hipLaunchKernelGGL(HipKernel1D,GRID,BLCK,0,nullptr,N,d_body);
695 MFEM_GPU_CHECK(hipGetLastError());
696}
697
698template <typename DBODY>
699void HipWrap2D(const int N, DBODY &&d_body,
700 const int X, const int Y, const int BZ)
701{
702 if (N==0) { return; }
703 const int GRID = (N+BZ-1)/BZ;
704 const dim3 BLCK(X,Y,BZ);
705 hipLaunchKernelGGL(HipKernel2D,GRID,BLCK,0,nullptr,N,d_body);
706 MFEM_GPU_CHECK(hipGetLastError());
707}
708
709template <typename DBODY>
710void HipWrap3D(const int N, DBODY &&d_body,
711 const int X, const int Y, const int Z, const int G)
712{
713 if (N==0) { return; }
714 const int GRID = G == 0 ? N : G;
715 const dim3 BLCK(X,Y,Z);
716 hipLaunchKernelGGL(HipKernel3D,GRID,BLCK,0,nullptr,N,d_body);
717 MFEM_GPU_CHECK(hipGetLastError());
718}
719
720template <int Dim>
721struct HipWrap;
722
723template <>
724struct HipWrap<1>
725{
726 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
727 static void run(const int N, DBODY &&d_body,
728 const int X, const int Y, const int Z, const int G)
729 {
730 HipWrap1D<BLCK>(N, d_body);
731 }
732};
733
734template <>
735struct HipWrap<2>
736{
737 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
738 static void run(const int N, DBODY &&d_body,
739 const int X, const int Y, const int Z, const int G)
740 {
741 HipWrap2D(N, d_body, X, Y, Z);
742 }
743};
744
745template <>
746struct HipWrap<3>
747{
748 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
749 static void run(const int N, DBODY &&d_body,
750 const int X, const int Y, const int Z, const int G)
751 {
752 HipWrap3D(N, d_body, X, Y, Z, G);
753 }
754};
755
756#endif // defined(MFEM_USE_HIP) && defined(__HIP__)
757
758
759/// The forall kernel body wrapper
760template <const int DIM, typename d_lambda, typename h_lambda>
761inline void ForallWrap(const bool use_dev, const int N,
762 d_lambda &&d_body, h_lambda &&h_body,
763 const int X=0, const int Y=0, const int Z=0,
764 const int G=0)
765{
766 MFEM_CONTRACT_VAR(X);
767 MFEM_CONTRACT_VAR(Y);
768 MFEM_CONTRACT_VAR(Z);
769 MFEM_CONTRACT_VAR(G);
770 MFEM_CONTRACT_VAR(d_body);
771 if (!use_dev) { goto backend_cpu; }
772
773#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA) && defined(__CUDACC__)
774 // If Backend::RAJA_CUDA is allowed, use it
775 if (Device::Allows(Backend::RAJA_CUDA))
776 {
777 return RajaCuWrap<DIM>::run(N, d_body, X, Y, Z, G);
778 }
779#endif
780
781#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP) && defined(__HIP__)
782 // If Backend::RAJA_HIP is allowed, use it
783 if (Device::Allows(Backend::RAJA_HIP))
784 {
785 return RajaHipWrap<DIM>::run(N, d_body, X, Y, Z, G);
786 }
787#endif
788
789#if defined(MFEM_USE_CUDA) && defined(__CUDACC__)
790 // If Backend::CUDA is allowed, use it
791 if (Device::Allows(Backend::CUDA))
792 {
793 return CuWrap<DIM>::run(N, d_body, X, Y, Z, G);
794 }
795#endif
796
797#if defined(MFEM_USE_HIP) && defined(__HIP__)
798 // If Backend::HIP is allowed, use it
799 if (Device::Allows(Backend::HIP))
800 {
801 return HipWrap<DIM>::run(N, d_body, X, Y, Z, G);
802 }
803#endif
804
805 // If Backend::DEBUG_DEVICE is allowed, use it
806 if (Device::Allows(Backend::DEBUG_DEVICE)) { goto backend_cpu; }
807
808#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
809 // If Backend::RAJA_OMP is allowed, use it
810 if (Device::Allows(Backend::RAJA_OMP)) { return RajaOmpWrap(N, h_body); }
811#endif
812
813#ifdef MFEM_USE_OPENMP
814 // If Backend::OMP is allowed, use it
815 if (Device::Allows(Backend::OMP)) { return OmpWrap(N, h_body); }
816#endif
817
818#ifdef MFEM_USE_RAJA
819 // If Backend::RAJA_CPU is allowed, use it
820 if (Device::Allows(Backend::RAJA_CPU)) { return RajaSeqWrap(N, h_body); }
821#endif
822
823backend_cpu:
824 // Handle Backend::CPU. This is also a fallback for any allowed backends not
825 // handled above, e.g. OCCA_CPU with configuration 'occa-cpu,cpu', or
826 // OCCA_OMP with configuration 'occa-omp,cpu'.
827 for (int k = 0; k < N; k++) { h_body(k); }
828}
829
830template <const int DIM, typename lambda>
831inline void ForallWrap(const bool use_dev, const int N, lambda &&body,
832 const int X=0, const int Y=0, const int Z=0,
833 const int G=0)
834{
835 ForallWrap<DIM>(use_dev, N, body, body, X, Y, Z, G);
836}
837
838template<typename lambda>
839inline void forall(int N, lambda &&body) { ForallWrap<1>(true, N, body); }
840
841template<typename lambda>
842inline void forall(int Nx, int Ny, lambda &&body)
843{
844 if (Device::Allows(Backend::DEVICE_MASK))
845 {
846 forall(Nx * Ny, [=] MFEM_HOST_DEVICE(int idx)
847 {
848 int j = idx / Nx;
849 int i = idx % Nx;
850 body(i, j);
851 });
852 }
853#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
854 else if (Device::Allows(Backend::RAJA_OMP))
855 {
856 return RajaOmpWrap2D(Nx, Ny, body);
857 }
858#endif
859#ifdef MFEM_USE_OPENMP
860 else if (Device::Allows(Backend::OMP))
861 {
862 return OmpWrap2D(Nx, Ny, body);
863 }
864#endif
865 else
866 {
867 for (int j = 0; j < Ny; ++j)
868 {
869 for (int i = 0; i < Nx; ++i)
870 {
871 body(i, j);
872 }
873 }
874 }
875}
876
877template<typename lambda>
878inline void forall(int Nx, int Ny, int Nz, lambda &&body)
879{
880 if (Device::Allows(Backend::DEVICE_MASK))
881 {
882 forall(Nx * Ny * Nz, [=] MFEM_HOST_DEVICE(int idx)
883 {
884 int i = idx % Nx;
885 int j = idx / Nx;
886 int k = j / Ny;
887 j = j % Ny;
888 body(i, j, k);
889 });
890 }
891#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
892 else if (Device::Allows(Backend::RAJA_OMP))
893 {
894 return RajaOmpWrap3D(Nx, Ny, Nz, body);
895 }
896#endif
897#ifdef MFEM_USE_OPENMP
898 else if (Device::Allows(Backend::OMP))
899 {
900 return OmpWrap3D(Nx, Ny, Nz, body);
901 }
902#endif
903 else
904 {
905 for (int k = 0; k < Nz; ++k)
906 {
907 for (int j = 0; j < Ny; ++j)
908 {
909 for (int i = 0; i < Nx; ++i)
910 {
911 body(i, j, k);
912 }
913 }
914 }
915 }
916}
917
918template<typename lambda>
919inline void forall_switch(bool use_dev, int N, lambda &&body)
920{
921 ForallWrap<1>(use_dev, N, body);
922}
923
924template<typename lambda>
925inline void forall_2D(int N, int X, int Y, lambda &&body)
926{
927 ForallWrap<2>(true, N, body, X, Y, 1);
928}
929
930template<typename lambda>
931inline void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
932{
933 ForallWrap<2>(true, N, body, X, Y, BZ);
934}
935
936template<typename lambda>
937inline void forall_3D(int N, int X, int Y, int Z, lambda &&body)
938{
939 ForallWrap<3>(true, N, body, X, Y, Z, 0);
940}
941
942template<typename lambda>
943inline void forall_3D_grid(int N, int X, int Y, int Z, int G, lambda &&body)
944{
945 ForallWrap<3>(true, N, body, X, Y, Z, G);
946}
947
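// A sketch of the function-style wrappers, mirroring the macros above; NE,
// Q1D, and BZ are hypothetical sizes, and MFEM_FOREACH_THREAD distributes the
// inner loops over the 2D thread block:
//
//    mfem::forall_2D_batch(NE, Q1D, Q1D, BZ, [=] MFEM_HOST_DEVICE (int e)
//    {
//       MFEM_FOREACH_THREAD(qy, y, Q1D)
//       {
//          MFEM_FOREACH_THREAD(qx, x, Q1D)
//          {
//             // per-(e, qx, qy) work goes here
//          }
//       }
//    });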
948#ifdef MFEM_USE_MPI
949
950// Function mfem::hypre_forall_cpu() is similar to mfem::forall, but it always
951// executes on the CPU using sequential or OpenMP-parallel execution based on
952// the hypre build-time configuration.
953template<typename lambda>
954inline void hypre_forall_cpu(int N, lambda &&body)
955{
956#ifdef HYPRE_USING_OPENMP
957 #pragma omp parallel for HYPRE_SMP_SCHEDULE
958#endif
959 for (int i = 0; i < N; i++) { body(i); }
960}
961
962// Function mfem::hypre_forall_gpu() is similar to mfem::forall, but it always
963// executes on the GPU device that hypre was configured with at build time.
964#if defined(HYPRE_USING_GPU)
965template<typename lambda>
966inline void hypre_forall_gpu(int N, lambda &&body)
967{
968#if defined(HYPRE_USING_CUDA)
969 CuWrap1D(N, body);
970#elif defined(HYPRE_USING_HIP)
971 HipWrap1D(N, body);
972#else
973#error Unknown HYPRE GPU backend!
974#endif
975}
976#endif
977
978// Function mfem::hypre_forall() is similar to mfem::forall, but it executes on the
979// device, CPU or GPU, that hypre was configured with at build time (when the
980// HYPRE version is < 2.31.0) or at runtime (when HYPRE was configured with GPU
981// support at build time and HYPRE's version is >= 2.31.0). This selection is
982// generally independent of what device was selected in MFEM's runtime
983// configuration.
984template<typename lambda>
985inline void hypre_forall(int N, lambda &&body)
986{
987#if !defined(HYPRE_USING_GPU)
988 hypre_forall_cpu(N, body);
989#elif MFEM_HYPRE_VERSION < 23100
990 hypre_forall_gpu(N, body);
991#else // HYPRE_USING_GPU is defined and MFEM_HYPRE_VERSION >= 23100
992 if (!HypreUsingGPU())
993 {
994 hypre_forall_cpu(N, body);
995 }
996 else
997 {
998 hypre_forall_gpu(N, body);
999 }
1000#endif
1001}
1002
1003// Return the most general MemoryClass that can be used with mfem::hypre_forall
1004// kernels. The returned MemoryClass is the same as the one returned by
1005// GetHypreMemoryClass() except when hypre is configured to use UVM, in which
1006// case this function returns MemoryClass::HOST or MemoryClass::DEVICE depending
1007// on the result of HypreUsingGPU().
1008inline MemoryClass GetHypreForallMemoryClass()
1009{
1010 return HypreUsingGPU() ? MemoryClass::DEVICE : MemoryClass::HOST;
1011}
1012
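// A sketch pairing hypre_forall with hypre-compatible memory; `v` is a
// hypothetical mfem::Vector whose data already lives in hypre's memory space:
//
//    const bool on_dev = GetHypreForallMemoryClass() == MemoryClass::DEVICE;
//    double *d = v.ReadWrite(on_dev);
//    hypre_forall(v.Size(), [=] MFEM_HOST_DEVICE (int i) { d[i] *= 2.0; });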
1013#endif // MFEM_USE_MPI
1014
1015} // namespace mfem
1016
1017#endif // MFEM_FORALL_HPP