MFEM v4.8.0
Finite element discretization library
forall.hpp
1// Copyright (c) 2010-2025, Lawrence Livermore National Security, LLC. Produced
2// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
3// LICENSE and NOTICE for details. LLNL-CODE-806117.
4//
5// This file is part of the MFEM library. For more information and source code
6// availability visit https://mfem.org.
7//
8// MFEM is free software; you can redistribute it and/or modify it under the
9// terms of the BSD-3 license. We welcome feedback and contributions, see file
10// CONTRIBUTING.md for details.
11
12#ifndef MFEM_FORALL_HPP
13#define MFEM_FORALL_HPP
14
15#include "../config/config.hpp"
16#include "annotation.hpp"
17#include "error.hpp"
18#include "backends.hpp"
19#include "device.hpp"
20#include "mem_manager.hpp"
21#include "../linalg/dtensor.hpp"
22#ifdef MFEM_USE_MPI
23#include <_hypre_utilities.h>
24#endif
25
26#include "array.hpp"
27#include "reducers.hpp"
28
29namespace mfem
30{
31
32// The following DofQuadLimits_ structs define the maximum values of D1D and Q1D
33// often used in the "fallback kernels" for partial assembly. Different limits
34// take effect for different architectures. The limits should be queried using
35// the public interface in DeviceDofQuadLimits or DofQuadLimits, and generally
36// not by directly accessing the structs defined below.
37//
38// In host code, the limits associated with the currently configured Device can
39// be accessed using DeviceDofQuadLimits::Get().
40//
41// In mfem::forall kernels or MFEM_HOST_DEVICE functions, the limits
42// corresponding to the architecture the function is being compiled for can be
43// accessed as static constexpr variables using the type alias DofQuadLimits.
44
45namespace internal
46{
47
48struct DofQuadLimits_CUDA
49{
50 static constexpr int MAX_D1D = 14;
51 static constexpr int MAX_Q1D = 14;
52 static constexpr int HCURL_MAX_D1D = 5;
53 static constexpr int HCURL_MAX_Q1D = 6;
54 static constexpr int HDIV_MAX_D1D = 5;
55 static constexpr int HDIV_MAX_Q1D = 6;
56 static constexpr int MAX_INTERP_1D = 8;
57 static constexpr int MAX_DET_1D = 6;
58};
59
60struct DofQuadLimits_HIP
61{
62 static constexpr int MAX_D1D = 10;
63 static constexpr int MAX_Q1D = 10;
64 static constexpr int HCURL_MAX_D1D = 5;
65 static constexpr int HCURL_MAX_Q1D = 5;
66 static constexpr int HDIV_MAX_D1D = 5;
67 static constexpr int HDIV_MAX_Q1D = 6;
68 static constexpr int MAX_INTERP_1D = 8;
69 static constexpr int MAX_DET_1D = 6;
70};
71
72struct DofQuadLimits_CPU
73{
74#ifndef _WIN32
75 static constexpr int MAX_D1D = 24;
76 static constexpr int MAX_Q1D = 24;
77#else
78 static constexpr int MAX_D1D = 14;
79 static constexpr int MAX_Q1D = 14;
80#endif
81 static constexpr int HCURL_MAX_D1D = 10;
82 static constexpr int HCURL_MAX_Q1D = 10;
83 static constexpr int HDIV_MAX_D1D = 10;
84 static constexpr int HDIV_MAX_Q1D = 10;
85 static constexpr int MAX_INTERP_1D = MAX_D1D;
86 static constexpr int MAX_DET_1D = MAX_D1D;
87};
88
89} // namespace internal
90
91/// @brief Maximum number of 1D DOFs or quadrature points for the architecture
92/// currently being compiled for (used in fallback kernels).
93///
94/// DofQuadLimits provides access to the limits as static constexpr member
95/// variables for use in mfem::forall kernels or MFEM_HOST_DEVICE functions.
96///
97/// @sa For accessing the limits according to the runtime configuration of the
98/// Device, see DeviceDofQuadLimits.
99#if defined(__CUDA_ARCH__)
100using DofQuadLimits = internal::DofQuadLimits_CUDA;
101#elif defined(__HIP_DEVICE_COMPILE__)
102using DofQuadLimits = internal::DofQuadLimits_HIP;
103#else
104using DofQuadLimits = internal::DofQuadLimits_CPU;
105#endif
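// Illustrative sketch (not part of this header): a fallback kernel body can
// size its scratch arrays with the compile-time limit for the target currently
// being compiled. The helper name below is hypothetical.
// MFEM_HOST_DEVICE inline void ExampleZeroQuadValues(int q1d, double *out)
// {
//    double buffer[DofQuadLimits::MAX_Q1D]; // fits any supported Q1D
//    for (int q = 0; q < q1d; ++q) { buffer[q] = 0.0; }
//    for (int q = 0; q < q1d; ++q) { out[q] = buffer[q]; }
// }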
106
107/// @brief Maximum number of 1D DOFs or quadrature points for the current
108/// runtime configuration of the Device (used in fallback kernels).
109///
110/// DeviceDofQuadLimits can be used in host code to query the limits for the
111/// configured device (e.g. when the user has selected GPU execution at
112/// runtime).
113///
114/// @sa For accessing the limits according to the current compiler pass, see
115/// DofQuadLimits.
116struct DeviceDofQuadLimits
117{
118 int MAX_D1D; ///< Maximum number of 1D nodal points.
119 int MAX_Q1D; ///< Maximum number of 1D quadrature points.
120 int HCURL_MAX_D1D; ///< Maximum number of 1D nodal points for H(curl).
121 int HCURL_MAX_Q1D; ///< Maximum number of 1D quadrature points for H(curl).
122 int HDIV_MAX_D1D; ///< Maximum number of 1D nodal points for H(div).
123 int HDIV_MAX_Q1D; ///< Maximum number of 1D quadrature points for H(div).
124 int MAX_INTERP_1D; ///< Maximum number of points for use in QuadratureInterpolator.
125 int MAX_DET_1D; ///< Maximum number of points for determinant computation in QuadratureInterpolator.
126
127 /// Return a const reference to the DeviceDofQuadLimits singleton.
128 static const DeviceDofQuadLimits &Get()
129 {
130 static const DeviceDofQuadLimits dof_quad_limits;
131 return dof_quad_limits;
132 }
133
134private:
135 /// Initialize the limits depending on the configuration of the Device.
136   DeviceDofQuadLimits()
137   {
138 if (Device::Allows(Backend::CUDA_MASK)) { Populate<internal::DofQuadLimits_CUDA>(); }
139 else if (Device::Allows(Backend::HIP_MASK)) { Populate<internal::DofQuadLimits_HIP>(); }
140 else { Populate<internal::DofQuadLimits_CPU>(); }
141 }
142
143 /// @brief Set the limits using the static members of the type @a T.
144 ///
145 /// @a T should be one of DofQuadLimits_CUDA, DofQuadLimits_HIP, or
146 /// DofQuadLimits_CPU.
147 template <typename T> void Populate()
148 {
149 MAX_D1D = T::MAX_D1D;
150 MAX_Q1D = T::MAX_Q1D;
151 HCURL_MAX_D1D = T::HCURL_MAX_D1D;
152 HCURL_MAX_Q1D = T::HCURL_MAX_Q1D;
153 HDIV_MAX_D1D = T::HDIV_MAX_D1D;
154 HDIV_MAX_Q1D = T::HDIV_MAX_Q1D;
155 MAX_INTERP_1D = T::MAX_INTERP_1D;
156 MAX_DET_1D = T::MAX_DET_1D;
157 }
158};
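// Illustrative sketch (not part of this header): host code can query the
// limits matching the runtime Device configuration through the singleton.
// The helper name below is hypothetical.
// inline int ExampleMaxAllowedOrder()
// {
//    const DeviceDofQuadLimits &limits = DeviceDofQuadLimits::Get();
//    return limits.MAX_D1D - 1; // polynomial order implied by the 1D DOF limit
// }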
159
160// MFEM pragma macros that can be used inside MFEM_FORALL macros.
161#define MFEM_PRAGMA(X) _Pragma(#X)
162
163// MFEM_UNROLL pragma macro that can be used inside MFEM_FORALL macros.
164#if defined(MFEM_USE_CUDA) && defined(__CUDA_ARCH__)
165#define MFEM_UNROLL(N) MFEM_PRAGMA(unroll(N))
166#else
167#define MFEM_UNROLL(N)
168#endif
169
170// MFEM_GPU_FORALL: "parallel for" executed with CUDA or HIP based on the MFEM
171// build-time configuration (MFEM_USE_CUDA or MFEM_USE_HIP). If neither CUDA nor
172// HIP is enabled, this macro is a no-op.
173#if defined(MFEM_USE_CUDA)
174#define MFEM_GPU_FORALL(i, N,...) CuWrap1D(N, [=] MFEM_DEVICE \
175 (int i) {__VA_ARGS__})
176#elif defined(MFEM_USE_HIP)
177#define MFEM_GPU_FORALL(i, N,...) HipWrap1D(N, [=] MFEM_DEVICE \
178 (int i) {__VA_ARGS__})
179#else
180#define MFEM_GPU_FORALL(i, N,...) do { } while (false)
181#endif
182
183// Implementation of MFEM's "parallel for" (forall) device/host kernel
184// interfaces supporting RAJA, CUDA, OpenMP, and sequential backends.
185
186// The MFEM_FORALL wrapper
187#define MFEM_FORALL(i,N,...) \
188 ForallWrap<1>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__})
189
190// MFEM_FORALL with a 2D CUDA block
191#define MFEM_FORALL_2D(i,N,X,Y,BZ,...) \
192 ForallWrap<2>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,BZ)
193
194// MFEM_FORALL with a 3D CUDA block
195#define MFEM_FORALL_3D(i,N,X,Y,Z,...) \
196 ForallWrap<3>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,Z)
197
198// MFEM_FORALL with a 3D CUDA block and grid
199// With G=0, this is the same as MFEM_FORALL_3D(i,N,X,Y,Z,...)
200#define MFEM_FORALL_3D_GRID(i,N,X,Y,Z,G,...) \
201 ForallWrap<3>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,Z,G)
202
203// MFEM_FORALL that uses the basic CPU backend when use_dev is false. See for
204// example the functions in vector.cpp, where we don't want to use the mfem
205// device for operations on small vectors.
206#define MFEM_FORALL_SWITCH(use_dev,i,N,...) \
207 ForallWrap<1>(use_dev,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__})
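// Illustrative sketch (not part of this header): the 1D macro is used like a
// regular for loop whose body becomes the lambda body. The function name is
// hypothetical and the pointers are assumed to be device-accessible, e.g.
// obtained from Vector::Read()/ReadWrite().
// inline void ExampleAxpy(int n, double a, const double *x, double *y)
// {
//    MFEM_FORALL(i, n, y[i] += a * x[i];);
// }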
208
209
210/// OpenMP backend
211template <typename HBODY>
212void OmpWrap(const int N, HBODY &&h_body)
213{
214#ifdef MFEM_USE_OPENMP
215 #pragma omp parallel for
216 for (int k = 0; k < N; k++)
217 {
218 h_body(k);
219 }
220#else
221 MFEM_CONTRACT_VAR(N);
222 MFEM_CONTRACT_VAR(h_body);
223 MFEM_ABORT("OpenMP requested for MFEM but OpenMP is not enabled!");
224#endif
225}
226
227
228/// RAJA Cuda and Hip backends
229#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
230using cuda_launch_policy =
231   RAJA::LaunchPolicy<RAJA::cuda_launch_t<true>>;
232using cuda_teams_x =
233   RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
234using cuda_threads_z =
235   RAJA::LoopPolicy<RAJA::cuda_thread_z_direct>;
236#endif
237
238#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
239using hip_launch_policy =
240   RAJA::LaunchPolicy<RAJA::hip_launch_t<true>>;
241using hip_teams_x =
242   RAJA::LoopPolicy<RAJA::hip_block_x_direct>;
243using hip_threads_z =
244   RAJA::LoopPolicy<RAJA::hip_thread_z_direct>;
245#endif
246
247#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
248template <const int BLOCKS = MFEM_CUDA_BLOCKS, typename DBODY>
249void RajaCuWrap1D(const int N, DBODY &&d_body)
250{
251 //true denotes asynchronous kernel
252 RAJA::forall<RAJA::cuda_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
253}
254
255template <typename DBODY>
256void RajaCuWrap2D(const int N, DBODY &&d_body,
257 const int X, const int Y, const int BZ)
258{
259 MFEM_VERIFY(BZ>0, "");
260 const int G = (N+BZ-1)/BZ;
261
262 using namespace RAJA;
263 using RAJA::RangeSegment;
264
265 launch<cuda_launch_policy>
266 (LaunchParams(Teams(G), Threads(X, Y, BZ)),
267 [=] RAJA_DEVICE (LaunchContext ctx)
268 {
269
270 loop<cuda_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
271 {
272
273 loop<cuda_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
274 {
275
276 const int k = n*BZ + tz;
277 if (k >= N) { return; }
278 d_body(k);
279
280 });
281
282 });
283
284 });
285
286 MFEM_GPU_CHECK(cudaGetLastError());
287}
288
289template <typename DBODY>
290void RajaCuWrap3D(const int N, DBODY &&d_body,
291 const int X, const int Y, const int Z, const int G)
292{
293 const int GRID = G == 0 ? N : G;
294 using namespace RAJA;
295 using RAJA::RangeSegment;
296
297 launch<cuda_launch_policy>
298 (LaunchParams(Teams(GRID), Threads(X, Y, Z)),
299 [=] RAJA_DEVICE (LaunchContext ctx)
300 {
301
302 loop<cuda_teams_x>(ctx, RangeSegment(0, N), d_body);
303
304 });
305
306 MFEM_GPU_CHECK(cudaGetLastError());
307}
308
309template <int Dim>
310struct RajaCuWrap;
311
312template <>
313struct RajaCuWrap<1>
314{
315 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
316 static void run(const int N, DBODY &&d_body,
317 const int X, const int Y, const int Z, const int G)
318 {
319 RajaCuWrap1D<BLCK>(N, d_body);
320 }
321};
322
323template <>
324struct RajaCuWrap<2>
325{
326 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
327 static void run(const int N, DBODY &&d_body,
328 const int X, const int Y, const int Z, const int G)
329 {
330 RajaCuWrap2D(N, d_body, X, Y, Z);
331 }
332};
333
334template <>
335struct RajaCuWrap<3>
336{
337 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
338 static void run(const int N, DBODY &&d_body,
339 const int X, const int Y, const int Z, const int G)
340 {
341 RajaCuWrap3D(N, d_body, X, Y, Z, G);
342 }
343};
344
345#endif
346
347#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
348template <const int BLOCKS = MFEM_HIP_BLOCKS, typename DBODY>
349void RajaHipWrap1D(const int N, DBODY &&d_body)
350{
351 //true denotes asynchronous kernel
352 RAJA::forall<RAJA::hip_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
353}
354
355template <typename DBODY>
356void RajaHipWrap2D(const int N, DBODY &&d_body,
357 const int X, const int Y, const int BZ)
358{
359 MFEM_VERIFY(BZ>0, "");
360 const int G = (N+BZ-1)/BZ;
361
362 using namespace RAJA;
363 using RAJA::RangeSegment;
364
365 launch<hip_launch_policy>
366 (LaunchParams(Teams(G), Threads(X, Y, BZ)),
367 [=] RAJA_DEVICE (LaunchContext ctx)
368 {
369
370 loop<hip_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
371 {
372
373 loop<hip_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
374 {
375
376 const int k = n*BZ + tz;
377 if (k >= N) { return; }
378 d_body(k);
379
380 });
381
382 });
383
384 });
385
386 MFEM_GPU_CHECK(hipGetLastError());
387}
388
389template <typename DBODY>
390void RajaHipWrap3D(const int N, DBODY &&d_body,
391 const int X, const int Y, const int Z, const int G)
392{
393 const int GRID = G == 0 ? N : G;
394 using namespace RAJA;
395 using RAJA::RangeSegment;
396
397 launch<hip_launch_policy>
398 (LaunchParams(Teams(GRID), Threads(X, Y, Z)),
399 [=] RAJA_DEVICE (LaunchContext ctx)
400 {
401
402 loop<hip_teams_x>(ctx, RangeSegment(0, N), d_body);
403
404 });
405
406 MFEM_GPU_CHECK(hipGetLastError());
407}
408
409template <int Dim>
410struct RajaHipWrap;
411
412template <>
413struct RajaHipWrap<1>
414{
415 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
416 static void run(const int N, DBODY &&d_body,
417 const int X, const int Y, const int Z, const int G)
418 {
419 RajaHipWrap1D<BLCK>(N, d_body);
420 }
421};
422
423template <>
424struct RajaHipWrap<2>
425{
426 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
427 static void run(const int N, DBODY &&d_body,
428 const int X, const int Y, const int Z, const int G)
429 {
430 RajaHipWrap2D(N, d_body, X, Y, Z);
431 }
432};
433
434template <>
435struct RajaHipWrap<3>
436{
437 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
438 static void run(const int N, DBODY &&d_body,
439 const int X, const int Y, const int Z, const int G)
440 {
441 RajaHipWrap3D(N, d_body, X, Y, Z, G);
442 }
443};
444
445#endif
446
447/// RAJA OpenMP backend
448#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
449
450template <typename HBODY>
451void RajaOmpWrap(const int N, HBODY &&h_body)
452{
453 RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0,N), h_body);
454}
455
456#endif
457
458
459/// RAJA sequential loop backend
460template <typename HBODY>
461void RajaSeqWrap(const int N, HBODY &&h_body)
462{
463#ifdef MFEM_USE_RAJA
464
465#if (RAJA_VERSION_MAJOR >= 2023)
466 //loop_exec was marked deprecated in RAJA version 2023.06.0
467 //and will be removed. We now use seq_exec.
468 using raja_forall_pol = RAJA::seq_exec;
469#else
470 using raja_forall_pol = RAJA::loop_exec;
471#endif
472
473 RAJA::forall<raja_forall_pol>(RAJA::RangeSegment(0,N), h_body);
474#else
475 MFEM_CONTRACT_VAR(N);
476 MFEM_CONTRACT_VAR(h_body);
477 MFEM_ABORT("RAJA requested but RAJA is not enabled!");
478#endif
479}
480
481
482/// CUDA backend
483#ifdef MFEM_USE_CUDA
484
485template <typename BODY> __global__ static
486void CuKernel1D(const int N, BODY body)
487{
488 const int k = blockDim.x*blockIdx.x + threadIdx.x;
489 if (k >= N) { return; }
490 body(k);
491}
492
493template <typename BODY> __global__ static
494void CuKernel2D(const int N, BODY body)
495{
496 const int k = blockIdx.x*blockDim.z + threadIdx.z;
497 if (k >= N) { return; }
498 body(k);
499}
500
501template <typename BODY> __global__ static
502void CuKernel3D(const int N, BODY body)
503{
504 for (int k = blockIdx.x; k < N; k += gridDim.x) { body(k); }
505}
506
507template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
508void CuWrap1D(const int N, DBODY &&d_body)
509{
510 if (N==0) { return; }
511 const int GRID = (N+BLCK-1)/BLCK;
512 CuKernel1D<<<GRID,BLCK>>>(N, d_body);
513 MFEM_GPU_CHECK(cudaGetLastError());
514}
515
516template <typename DBODY>
517void CuWrap2D(const int N, DBODY &&d_body,
518 const int X, const int Y, const int BZ)
519{
520 if (N==0) { return; }
521 MFEM_VERIFY(BZ>0, "");
522 const int GRID = (N+BZ-1)/BZ;
523 const dim3 BLCK(X,Y,BZ);
524 CuKernel2D<<<GRID,BLCK>>>(N,d_body);
525 MFEM_GPU_CHECK(cudaGetLastError());
526}
527
528template <typename DBODY>
529void CuWrap3D(const int N, DBODY &&d_body,
530 const int X, const int Y, const int Z, const int G)
531{
532 if (N==0) { return; }
533 const int GRID = G == 0 ? N : G;
534 const dim3 BLCK(X,Y,Z);
535 CuKernel3D<<<GRID,BLCK>>>(N,d_body);
536 MFEM_GPU_CHECK(cudaGetLastError());
537}
538
539template <int Dim>
540struct CuWrap;
541
542template <>
543struct CuWrap<1>
544{
545 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
546 static void run(const int N, DBODY &&d_body,
547 const int X, const int Y, const int Z, const int G)
548 {
549 CuWrap1D<BLCK>(N, d_body);
550 }
551};
552
553template <>
554struct CuWrap<2>
555{
556 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
557 static void run(const int N, DBODY &&d_body,
558 const int X, const int Y, const int Z, const int G)
559 {
560 CuWrap2D(N, d_body, X, Y, Z);
561 }
562};
563
564template <>
565struct CuWrap<3>
566{
567 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
568 static void run(const int N, DBODY &&d_body,
569 const int X, const int Y, const int Z, const int G)
570 {
571 CuWrap3D(N, d_body, X, Y, Z, G);
572 }
573};
574
575#endif // MFEM_USE_CUDA
576
577
578/// HIP backend
579#ifdef MFEM_USE_HIP
580
581template <typename BODY> __global__ static
582void HipKernel1D(const int N, BODY body)
583{
584 const int k = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
585 if (k >= N) { return; }
586 body(k);
587}
588
589template <typename BODY> __global__ static
590void HipKernel2D(const int N, BODY body)
591{
592 const int k = hipBlockIdx_x*hipBlockDim_z + hipThreadIdx_z;
593 if (k >= N) { return; }
594 body(k);
595}
596
597template <typename BODY> __global__ static
598void HipKernel3D(const int N, BODY body)
599{
600 for (int k = hipBlockIdx_x; k < N; k += hipGridDim_x) { body(k); }
601}
602
603template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
604void HipWrap1D(const int N, DBODY &&d_body)
605{
606 if (N==0) { return; }
607 const int GRID = (N+BLCK-1)/BLCK;
608 hipLaunchKernelGGL(HipKernel1D,GRID,BLCK,0,nullptr,N,d_body);
609 MFEM_GPU_CHECK(hipGetLastError());
610}
611
612template <typename DBODY>
613void HipWrap2D(const int N, DBODY &&d_body,
614 const int X, const int Y, const int BZ)
615{
616 if (N==0) { return; }
617 const int GRID = (N+BZ-1)/BZ;
618 const dim3 BLCK(X,Y,BZ);
619 hipLaunchKernelGGL(HipKernel2D,GRID,BLCK,0,nullptr,N,d_body);
620 MFEM_GPU_CHECK(hipGetLastError());
621}
622
623template <typename DBODY>
624void HipWrap3D(const int N, DBODY &&d_body,
625 const int X, const int Y, const int Z, const int G)
626{
627 if (N==0) { return; }
628 const int GRID = G == 0 ? N : G;
629 const dim3 BLCK(X,Y,Z);
630 hipLaunchKernelGGL(HipKernel3D,GRID,BLCK,0,nullptr,N,d_body);
631 MFEM_GPU_CHECK(hipGetLastError());
632}
633
634template <int Dim>
635struct HipWrap;
636
637template <>
638struct HipWrap<1>
639{
640 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
641 static void run(const int N, DBODY &&d_body,
642 const int X, const int Y, const int Z, const int G)
643 {
644 HipWrap1D<BLCK>(N, d_body);
645 }
646};
647
648template <>
649struct HipWrap<2>
650{
651 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
652 static void run(const int N, DBODY &&d_body,
653 const int X, const int Y, const int Z, const int G)
654 {
655 HipWrap2D(N, d_body, X, Y, Z);
656 }
657};
658
659template <>
660struct HipWrap<3>
661{
662 template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
663 static void run(const int N, DBODY &&d_body,
664 const int X, const int Y, const int Z, const int G)
665 {
666 HipWrap3D(N, d_body, X, Y, Z, G);
667 }
668};
669
670#endif // MFEM_USE_HIP
671
672
673/// The forall kernel body wrapper
674template <const int DIM, typename d_lambda, typename h_lambda>
675inline void ForallWrap(const bool use_dev, const int N,
676 d_lambda &&d_body, h_lambda &&h_body,
677 const int X=0, const int Y=0, const int Z=0,
678 const int G=0)
679{
680 MFEM_CONTRACT_VAR(X);
681 MFEM_CONTRACT_VAR(Y);
682 MFEM_CONTRACT_VAR(Z);
683 MFEM_CONTRACT_VAR(G);
684 MFEM_CONTRACT_VAR(d_body);
685 if (!use_dev) { goto backend_cpu; }
686
687#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
688 // If Backend::RAJA_CUDA is allowed, use it
689   if (Device::Allows(Backend::RAJA_CUDA))
690   {
691 return RajaCuWrap<DIM>::run(N, d_body, X, Y, Z, G);
692 }
693#endif
694
695#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
696 // If Backend::RAJA_HIP is allowed, use it
697   if (Device::Allows(Backend::RAJA_HIP))
698   {
699 return RajaHipWrap<DIM>::run(N, d_body, X, Y, Z, G);
700 }
701#endif
702
703#ifdef MFEM_USE_CUDA
704 // If Backend::CUDA is allowed, use it
705   if (Device::Allows(Backend::CUDA))
706   {
707 return CuWrap<DIM>::run(N, d_body, X, Y, Z, G);
708 }
709#endif
710
711#ifdef MFEM_USE_HIP
712 // If Backend::HIP is allowed, use it
713   if (Device::Allows(Backend::HIP))
714   {
715 return HipWrap<DIM>::run(N, d_body, X, Y, Z, G);
716 }
717#endif
718
719 // If Backend::DEBUG_DEVICE is allowed, use it
720 if (Device::Allows(Backend::DEBUG_DEVICE)) { goto backend_cpu; }
721
722#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
723 // If Backend::RAJA_OMP is allowed, use it
724 if (Device::Allows(Backend::RAJA_OMP)) { return RajaOmpWrap(N, h_body); }
725#endif
726
727#ifdef MFEM_USE_OPENMP
728 // If Backend::OMP is allowed, use it
729 if (Device::Allows(Backend::OMP)) { return OmpWrap(N, h_body); }
730#endif
731
732#ifdef MFEM_USE_RAJA
733 // If Backend::RAJA_CPU is allowed, use it
734 if (Device::Allows(Backend::RAJA_CPU)) { return RajaSeqWrap(N, h_body); }
735#endif
736
737backend_cpu:
738 // Handle Backend::CPU. This is also a fallback for any allowed backends not
739 // handled above, e.g. OCCA_CPU with configuration 'occa-cpu,cpu', or
740 // OCCA_OMP with configuration 'occa-omp,cpu'.
741 for (int k = 0; k < N; k++) { h_body(k); }
742}
743
744template <const int DIM, typename lambda>
745inline void ForallWrap(const bool use_dev, const int N, lambda &&body,
746 const int X=0, const int Y=0, const int Z=0,
747 const int G=0)
748{
749 ForallWrap<DIM>(use_dev, N, body, body, X, Y, Z, G);
750}
751
752template<typename lambda>
753inline void forall(int N, lambda &&body) { ForallWrap<1>(true, N, body); }
754
755template<typename lambda>
756inline void forall_switch(bool use_dev, int N, lambda &&body)
757{
758 ForallWrap<1>(use_dev, N, body);
759}
760
761template<typename lambda>
762inline void forall_2D(int N, int X, int Y, lambda &&body)
763{
764 ForallWrap<2>(true, N, body, X, Y, 1);
765}
766
767template<typename lambda>
768inline void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
769{
770 ForallWrap<2>(true, N, body, X, Y, BZ);
771}
772
773template<typename lambda>
774inline void forall_3D(int N, int X, int Y, int Z, lambda &&body)
775{
776 ForallWrap<3>(true, N, body, X, Y, Z, 0);
777}
778
779template<typename lambda>
780inline void forall_3D_grid(int N, int X, int Y, int Z, int G, lambda &&body)
781{
782 ForallWrap<3>(true, N, body, X, Y, Z, G);
783}
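// Illustrative sketch (not part of this header): the function-style interface
// covers the same cases without macros. The function names are hypothetical
// and the pointers are assumed to be device-accessible.
// inline void ExampleForall(int n, double a, const double *x, double *y)
// {
//    forall(n, [=] MFEM_HOST_DEVICE (int i) { y[i] += a * x[i]; });
// }
// inline void ExampleForallSwitch(bool on_device, int n, double *y)
// {
//    // Small loops can force host execution by passing use_dev = false.
//    forall_switch(on_device, n, [=] MFEM_HOST_DEVICE (int i) { y[i] = 0.0; });
// }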
784
785#ifdef MFEM_USE_MPI
786
787// Function mfem::hypre_forall_cpu() similar to mfem::forall, but it always
788// executes on the CPU using sequential or OpenMP-parallel execution based on
789// the hypre build time configuration.
790template<typename lambda>
791inline void hypre_forall_cpu(int N, lambda &&body)
792{
793#ifdef HYPRE_USING_OPENMP
794 #pragma omp parallel for HYPRE_SMP_SCHEDULE
795#endif
796 for (int i = 0; i < N; i++) { body(i); }
797}
798
799// Function mfem::hypre_forall_gpu() similar to mfem::forall, but it always
800// executes on the GPU device that hypre was configured with at build time.
801#if defined(HYPRE_USING_GPU)
802template<typename lambda>
803inline void hypre_forall_gpu(int N, lambda &&body)
804{
805#if defined(HYPRE_USING_CUDA)
806 CuWrap1D(N, body);
807#elif defined(HYPRE_USING_HIP)
808 HipWrap1D(N, body);
809#else
810#error Unknown HYPRE GPU backend!
811#endif
812}
813#endif
814
815// Function mfem::hypre_forall() similar to mfem::forall, but it executes on the
816// device, CPU or GPU, that hypre was configured with at build time (when the
817// HYPRE version is < 2.31.0) or at runtime (when HYPRE was configured with GPU
818// support at build time and HYPRE's version is >= 2.31.0). This selection is
819// generally independent of what device was selected in MFEM's runtime
820// configuration.
821template<typename lambda>
822inline void hypre_forall(int N, lambda &&body)
823{
824#if !defined(HYPRE_USING_GPU)
825 hypre_forall_cpu(N, body);
826#elif MFEM_HYPRE_VERSION < 23100
827 hypre_forall_gpu(N, body);
828#else // HYPRE_USING_GPU is defined and MFEM_HYPRE_VERSION >= 23100
829 if (!HypreUsingGPU())
830 {
831 hypre_forall_cpu(N, body);
832 }
833 else
834 {
835 hypre_forall_gpu(N, body);
836 }
837#endif
838}
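// Illustrative sketch (not part of this header): a loop over data owned by
// hypre should use this wrapper so it runs wherever hypre keeps its data. The
// function name is hypothetical; it requires MFEM_USE_MPI and a pointer that
// is valid in hypre's memory space.
// inline void ExampleHypreScale(int n, double alpha, double *hypre_data)
// {
//    hypre_forall(n, [=] MFEM_HOST_DEVICE (int i) { hypre_data[i] *= alpha; });
// }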
839
840// Return the most general MemoryClass that can be used with mfem::hypre_forall
841// kernels. The returned MemoryClass is the same as the one returned by
842// GetHypreMemoryClass() except when hypre is configured to use UVM, in which
843// case this function returns MemoryClass::HOST or MemoryClass::DEVICE depending
844// on the result of HypreUsingGPU().
845inline MemoryClass GetHypreForallMemoryClass()
846{
847   return HypreUsingGPU() ? MemoryClass::DEVICE : MemoryClass::HOST;
848}
849
850#endif // MFEM_USE_MPI
851
852namespace internal
853{
854/**
855 @brief Device portion of a reduction over a 1D sequence [0, N)
856 @tparam B Reduction body. Must be callable with the signature void(int i, value_type&
857 v), where i is the index to evaluate and v is the value to update.
858 @tparam R Reducer capable of combining values of type value_type. See reducers.hpp for
859 pre-defined reducers.
860 */
861template<class B, class R> struct reduction_kernel
862{
863   /// Value type the body and reducer operate on.
864 using value_type = typename R::value_type;
865 /// workspace for the intermediate reduction results
866 mutable value_type *work;
867 B body;
868 R reducer;
869 /// Length of sequence to reduce over.
870 int N;
871   /// How many items each thread is responsible for during the serial phase.
872 int items_per_thread;
873
874 constexpr static MFEM_HOST_DEVICE int max_blocksize() { return 256; }
875
876 /// helper for computing the reduction block size
877 static int block_log2(unsigned N)
878 {
879#if defined(__GNUC__) or defined(__clang__)
880 return N ? (sizeof(unsigned) * 8 - __builtin_clz(N)) : 0;
881#elif defined(_MSC_VER)
882      return sizeof(unsigned) * 8 - __lzcnt(N);
883#else
884 int res = 0;
885 while (N)
886 {
887 N >>= 1;
888 ++res;
889 }
890 return res;
891#endif
892 }
893
894 MFEM_HOST_DEVICE void operator()(int work_idx) const
895 {
896 MFEM_SHARED value_type buffer[max_blocksize()];
897 reducer.SetInitialValue(buffer[MFEM_THREAD_ID(x)]);
898 // serial part
899 for (int idx = 0; idx < items_per_thread; ++idx)
900 {
901 int i = MFEM_THREAD_ID(x) +
902 (idx + work_idx * items_per_thread) * MFEM_THREAD_SIZE(x);
903 if (i < N)
904 {
905 body(i, buffer[MFEM_THREAD_ID(x)]);
906 }
907 else
908 {
909 break;
910 }
911 }
912 // binary tree reduction
913 for (int i = (MFEM_THREAD_SIZE(x) >> 1); i > 0; i >>= 1)
914 {
915 MFEM_SYNC_THREAD;
916 if (MFEM_THREAD_ID(x) < i)
917 {
918 reducer.Join(buffer[MFEM_THREAD_ID(x)], buffer[MFEM_THREAD_ID(x) + i]);
919 }
920 }
921 if (MFEM_THREAD_ID(x) == 0)
922 {
923 work[work_idx] = buffer[0];
924 }
925 }
926};
927}
928
929/**
930 @brief Performs a 1D reduction on the range [0,N).
931 @a res initial value and where the result will be written.
932 @a body reduction function body.
933 @a reducer helper for joining two reduced values.
934 @a use_dev true to perform the reduction on the device, if possible.
935 @a workspace temporary workspace used for device reductions. May be resized to
936 a larger capacity as needed. Preferably should have MemoryType::MANAGED or
937 MemoryType::HOST_PINNED. TODO: replace with internal temporary workspace
938 vectors once that's added to the memory manager.
939 @tparam T value_type to operate on
940 */
941template <class T, class B, class R>
942void reduce(int N, T &res, B &&body, const R &reducer, bool use_dev,
943 Array<T> &workspace)
944{
945 if (N == 0)
946 {
947 return;
948 }
949
950#if defined(MFEM_USE_HIP) || defined(MFEM_USE_CUDA)
951   if (use_dev &&
952       Device::Allows(Backend::CUDA_MASK |
953                      Backend::HIP_MASK))
954   {
955 using red_type = internal::reduction_kernel<typename std::decay<B>::type,
956 typename std::decay<R>::type>;
957 // max block size is 256, but can be smaller
958 int block_size = std::min<int>(red_type::max_blocksize(),
959 1ll << red_type::block_log2(N));
960
960
961      int num_mp = Device::NumMultiprocessors(Device::GetId());
962#if defined(MFEM_USE_CUDA)
963 // good value of mp_sat found experimentally on Lassen
964 constexpr int mp_sat = 8;
965#elif defined(MFEM_USE_HIP)
966 // good value of mp_sat found experimentally on Tuolumne
967 constexpr int mp_sat = 4;
968#else
969 num_mp = 1;
970 constexpr int mp_sat = 1;
971#endif
972 // determine how many items each thread should sum during the serial
973 // portion
974 int nblocks = std::min(mp_sat * num_mp, (N + block_size - 1) / block_size);
975 int items_per_thread =
976 (N + block_size * nblocks - 1) / (block_size * nblocks);
977
978 red_type red{nullptr, std::forward<B>(body), reducer, N, items_per_thread};
979      // allocate the workspace to fit nblocks partial results
980 auto mt = workspace.GetMemory().GetMemoryType();
981      if (mt != MemoryType::HOST_PINNED && mt != MemoryType::MANAGED)
982      {
983         mt = MemoryType::HOST_PINNED;
984      }
985 workspace.SetSize(nblocks, mt);
986 auto work = workspace.HostWrite();
987 red.work = work;
988 forall_2D(nblocks, block_size, 1, std::move(red));
989 // wait for results
990 MFEM_DEVICE_SYNC;
991 for (int i = 0; i < nblocks; ++i)
992 {
993 reducer.Join(res, work[i]);
994 }
995 return;
996 }
997#endif
998
999 for (int i = 0; i < N; ++i)
1000 {
1001 body(i, res);
1002 }
1003}
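// Illustrative sketch (not part of this header): a sum-of-squares reduction
// with mfem::reduce. The function name is hypothetical, SumReducer is assumed
// to be one of the reducers defined in reducers.hpp, and d_data is assumed to
// be device-accessible.
// inline double ExampleSumOfSquares(int n, const double *d_data,
//                                   Array<double> &workspace)
// {
//    double result = 0.0;
//    reduce(n, result,
//           [=] MFEM_HOST_DEVICE (int i, double &v) { v += d_data[i]*d_data[i]; },
//           SumReducer<double> {}, /*use_dev=*/true, workspace);
//    return result;
// }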
1004
1005} // namespace mfem
1006
1007#endif // MFEM_FORALL_HPP