#ifndef MFEM_FORALL_HPP
#define MFEM_FORALL_HPP

#include <_hypre_utilities.h>
namespace internal
{

struct DofQuadLimits_CUDA
{
   static constexpr int MAX_D1D = 14;
   static constexpr int MAX_Q1D = 14;
   static constexpr int HCURL_MAX_D1D = 5;
   static constexpr int HCURL_MAX_Q1D = 6;
   static constexpr int HDIV_MAX_D1D = 5;
   static constexpr int HDIV_MAX_Q1D = 6;
   static constexpr int MAX_INTERP_1D = 8;
   static constexpr int MAX_DET_1D = 6;
};
struct DofQuadLimits_HIP
{
   static constexpr int MAX_D1D = 10;
   static constexpr int MAX_Q1D = 10;
   static constexpr int HCURL_MAX_D1D = 5;
   static constexpr int HCURL_MAX_Q1D = 5;
   static constexpr int HDIV_MAX_D1D = 5;
   static constexpr int HDIV_MAX_Q1D = 6;
   static constexpr int MAX_INTERP_1D = 8;
   static constexpr int MAX_DET_1D = 6;
};
struct DofQuadLimits_CPU
{
#ifndef _WIN32
   static constexpr int MAX_D1D = 24;
   static constexpr int MAX_Q1D = 24;
#else // reduced limits for Windows builds
   static constexpr int MAX_D1D = 14;
   static constexpr int MAX_Q1D = 14;
#endif
   static constexpr int HCURL_MAX_D1D = 10;
   static constexpr int HCURL_MAX_Q1D = 10;
   static constexpr int HDIV_MAX_D1D = 10;
   static constexpr int HDIV_MAX_Q1D = 10;
   static constexpr int MAX_INTERP_1D = MAX_D1D;
   static constexpr int MAX_DET_1D = MAX_D1D;
};

} // namespace internal
/// Maximum number of 1D DOFs or quadrature points for the architecture
/// currently being compiled for (used in fallback kernels).
#if defined(__CUDA_ARCH__)
using DofQuadLimits = internal::DofQuadLimits_CUDA;
#elif defined(__HIP_DEVICE_COMPILE__)
using DofQuadLimits = internal::DofQuadLimits_HIP;
#else
using DofQuadLimits = internal::DofQuadLimits_CPU;
#endif
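// Example (illustrative sketch, not part of the upstream header): the
// compile-time limits above are typically used to size stack arrays inside
// fallback kernels, while DeviceDofQuadLimits::Get() (defined next) reports
// the limits of the backend selected at runtime. Here `q1d` is a hypothetical
// runtime quadrature size.
//
//     double B[DofQuadLimits::MAX_Q1D * DofQuadLimits::MAX_D1D]; // in-kernel scratch
//     MFEM_VERIFY(q1d <= DeviceDofQuadLimits::Get().MAX_Q1D,
//                 "Q1D exceeds the limit of the active device backend");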
/// Maximum number of 1D DOFs or quadrature points for the current runtime
/// configuration of the Device (used in fallback kernels).
struct DeviceDofQuadLimits
{
   int MAX_D1D;        ///< Maximum number of 1D nodal points.
   int MAX_Q1D;        ///< Maximum number of 1D quadrature points.
   int HCURL_MAX_D1D;  ///< Maximum number of 1D nodal points for H(curl).
   int HCURL_MAX_Q1D;  ///< Maximum number of 1D quadrature points for H(curl).
   int HDIV_MAX_D1D;   ///< Maximum number of 1D nodal points for H(div).
   int HDIV_MAX_Q1D;   ///< Maximum number of 1D quadrature points for H(div).
   int MAX_INTERP_1D;  ///< Maximum number of points for QuadratureInterpolator.
   int MAX_DET_1D;     ///< Maximum number of points for determinant computation.

   /// Return a const reference to the DeviceDofQuadLimits singleton.
   static const DeviceDofQuadLimits &Get()
   {
      static const DeviceDofQuadLimits dof_quad_limits;
      return dof_quad_limits;
   }

private:
   /// Initialize the limits from the backend allowed by the Device.
   DeviceDofQuadLimits()
   {
      if (Device::Allows(Backend::CUDA_MASK)) { Populate<internal::DofQuadLimits_CUDA>(); }
      else if (Device::Allows(Backend::HIP_MASK)) { Populate<internal::DofQuadLimits_HIP>(); }
      else { Populate<internal::DofQuadLimits_CPU>(); }
   }

   /// Copy the compile-time limits of T into the runtime fields.
   template <typename T> void Populate()
   {
      MAX_D1D = T::MAX_D1D; MAX_Q1D = T::MAX_Q1D;
      HCURL_MAX_D1D = T::HCURL_MAX_D1D; HCURL_MAX_Q1D = T::HCURL_MAX_Q1D;
      HDIV_MAX_D1D = T::HDIV_MAX_D1D; HDIV_MAX_Q1D = T::HDIV_MAX_Q1D;
      MAX_INTERP_1D = T::MAX_INTERP_1D; MAX_DET_1D = T::MAX_DET_1D;
   }
};
// MFEM pragma macro that can be used inside MFEM_FORALL macros.
#define MFEM_PRAGMA(X) _Pragma(#X)

// MFEM_UNROLL pragma macro that can be used inside MFEM_FORALL macros.
#if defined(MFEM_USE_CUDA) && defined(__CUDA_ARCH__)
#define MFEM_UNROLL(N) MFEM_PRAGMA(unroll(N))
#else
#define MFEM_UNROLL(N)
#endif
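// Example (illustrative, not from the upstream header): MFEM_UNROLL is meant
// for use inside MFEM_FORALL bodies, where a literal "#pragma unroll" cannot
// appear in the macro expansion. Here `d_x`, `d_y` stand for device pointers
// captured by value and `N` is the number of 4-entry groups (assumptions).
//
//     MFEM_FORALL(i, N,
//     {
//        MFEM_UNROLL(4)
//        for (int j = 0; j < 4; j++) { d_y[4*i + j] += d_x[4*i + j]; }
//     });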
// MFEM_GPU_FORALL: "parallel for" executed with CUDA or HIP based on the MFEM
// build-time configuration; expands to a no-op when neither backend is enabled.
#if defined(MFEM_USE_CUDA)
#define MFEM_GPU_FORALL(i, N,...) CuWrap1D(N, [=] MFEM_DEVICE \
                                           (int i) {__VA_ARGS__})
#elif defined(MFEM_USE_HIP)
#define MFEM_GPU_FORALL(i, N,...) HipWrap1D(N, [=] MFEM_DEVICE \
                                            (int i) {__VA_ARGS__})
#else
#define MFEM_GPU_FORALL(i, N,...) do { } while (false)
#endif
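// Example (illustrative): MFEM_GPU_FORALL always targets the GPU backend that
// MFEM was built with, independently of the runtime Device configuration;
// `d_a` is assumed to be a device pointer of length `n`.
//
//     MFEM_GPU_FORALL(i, n, d_a[i] = 0.0;);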
// The MFEM_FORALL wrapper.
#define MFEM_FORALL(i,N,...) \
   ForallWrap<1>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__})

// MFEM_FORALL with a 2D GPU thread block of size X x Y x BZ.
#define MFEM_FORALL_2D(i,N,X,Y,BZ,...) \
   ForallWrap<2>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,BZ)

// MFEM_FORALL with a 3D GPU thread block of size X x Y x Z.
#define MFEM_FORALL_3D(i,N,X,Y,Z,...) \
   ForallWrap<3>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,Z)

// MFEM_FORALL with a 3D GPU thread block and grid size G (0 means N blocks).
#define MFEM_FORALL_3D_GRID(i,N,X,Y,Z,G,...) \
   ForallWrap<3>(true,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__},X,Y,Z,G)

// MFEM_FORALL that uses the basic CPU backend when use_dev is false.
#define MFEM_FORALL_SWITCH(use_dev,i,N,...) \
   ForallWrap<1>(use_dev,N,[=] MFEM_HOST_DEVICE (int i) {__VA_ARGS__})
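// Example (illustrative sketch, not part of the upstream header): the macro
// family wraps the loop body in a host/device lambda over the index. Here
// `d_x`, `d_y` are assumed device pointers (e.g. from Vector::ReadWrite()),
// and `n`, `NE`, `Q1D`, `BZ`, `a` are placeholder sizes/values.
//
//     MFEM_FORALL(i, n, d_y[i] += a * d_x[i];);
//
//     // one X x Y x BZ thread block handles BZ elements at a time
//     MFEM_FORALL_2D(e, NE, Q1D, Q1D, BZ,
//     {
//        MFEM_FOREACH_THREAD(qy, y, Q1D)
//        {
//           MFEM_FOREACH_THREAD(qx, x, Q1D)
//           {
//              // per-quadrature-point work for element e
//           }
//        }
//     });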
/// OpenMP backend.
template <typename HBODY>
void OmpWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_OPENMP
   #pragma omp parallel for
   for (int k = 0; k < N; k++)
   {
      h_body(k);
   }
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("OpenMP requested for MFEM but OpenMP is not enabled!");
#endif
}
/// RAJA CUDA and HIP backends.
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
using cuda_launch_policy = RAJA::LaunchPolicy<RAJA::cuda_launch_t<true>>;
using cuda_teams_x = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
using cuda_threads_z = RAJA::LoopPolicy<RAJA::cuda_thread_z_direct>;
#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
using hip_launch_policy = RAJA::LaunchPolicy<RAJA::hip_launch_t<true>>;
using hip_teams_x = RAJA::LoopPolicy<RAJA::hip_block_x_direct>;
using hip_threads_z = RAJA::LoopPolicy<RAJA::hip_thread_z_direct>;
#endif
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
template <const int BLOCKS = MFEM_CUDA_BLOCKS, typename DBODY>
void RajaCuWrap1D(const int N, DBODY &&d_body)
{
   // 'true' selects the asynchronous RAJA CUDA execution policy
   RAJA::forall<RAJA::cuda_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
}

template <typename DBODY>
void RajaCuWrap2D(const int N, DBODY &&d_body,
                  const int X, const int Y, const int BZ)
{
   MFEM_VERIFY(BZ>0, "");
   const int G = (N+BZ-1)/BZ;

   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<cuda_launch_policy>
   (LaunchParams(Teams(G), Threads(X, Y, BZ)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<cuda_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
      {
         loop<cuda_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
         {
            const int k = n*BZ + tz;
            if (k >= N) { return; }
            d_body(k);
         });
      });
   });

   MFEM_GPU_CHECK(cudaGetLastError());
}
template <typename DBODY>
void RajaCuWrap3D(const int N, DBODY &&d_body,
                  const int X, const int Y, const int Z, const int G)
{
   const int GRID = G == 0 ? N : G;
   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<cuda_launch_policy>
   (LaunchParams(Teams(GRID), Threads(X, Y, Z)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<cuda_teams_x>(ctx, RangeSegment(0, N), d_body);
   });

   MFEM_GPU_CHECK(cudaGetLastError());
}
template <int Dim>
struct RajaCuWrap;

template <>
struct RajaCuWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaCuWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct RajaCuWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaCuWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct RajaCuWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaCuWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif // MFEM_USE_RAJA && RAJA_ENABLE_CUDA
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
template <const int BLOCKS = MFEM_HIP_BLOCKS, typename DBODY>
void RajaHipWrap1D(const int N, DBODY &&d_body)
{
   // 'true' selects the asynchronous RAJA HIP execution policy
   RAJA::forall<RAJA::hip_exec<BLOCKS,true>>(RAJA::RangeSegment(0,N),d_body);
}

template <typename DBODY>
void RajaHipWrap2D(const int N, DBODY &&d_body,
                   const int X, const int Y, const int BZ)
{
   MFEM_VERIFY(BZ>0, "");
   const int G = (N+BZ-1)/BZ;

   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<hip_launch_policy>
   (LaunchParams(Teams(G), Threads(X, Y, BZ)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<hip_teams_x>(ctx, RangeSegment(0, G), [&] (const int n)
      {
         loop<hip_threads_z>(ctx, RangeSegment(0, BZ), [&] (const int tz)
         {
            const int k = n*BZ + tz;
            if (k >= N) { return; }
            d_body(k);
         });
      });
   });

   MFEM_GPU_CHECK(hipGetLastError());
}
template <typename DBODY>
void RajaHipWrap3D(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
{
   const int GRID = G == 0 ? N : G;
   using namespace RAJA;
   using RAJA::RangeSegment;

   launch<hip_launch_policy>
   (LaunchParams(Teams(GRID), Threads(X, Y, Z)),
    [=] RAJA_DEVICE (LaunchContext ctx)
   {
      loop<hip_teams_x>(ctx, RangeSegment(0, N), d_body);
   });

   MFEM_GPU_CHECK(hipGetLastError());
}
template <int Dim>
struct RajaHipWrap;

template <>
struct RajaHipWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaHipWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct RajaHipWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaHipWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct RajaHipWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      RajaHipWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif // MFEM_USE_RAJA && RAJA_ENABLE_HIP
/// RAJA OpenMP backend.
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
template <typename HBODY>
void RajaOmpWrap(const int N, HBODY &&h_body)
{
   RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0,N), h_body);
}
#endif

/// RAJA sequential loop backend.
template <typename HBODY>
void RajaSeqWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_RAJA
#if (RAJA_VERSION_MAJOR >= 2023)
   // loop_exec was deprecated in RAJA 2023.06.0; use seq_exec instead.
   using raja_forall_pol = RAJA::seq_exec;
#else
   using raja_forall_pol = RAJA::loop_exec;
#endif
   RAJA::forall<raja_forall_pol>(RAJA::RangeSegment(0,N), h_body);
#else
   MFEM_CONTRACT_VAR(N);
   MFEM_CONTRACT_VAR(h_body);
   MFEM_ABORT("RAJA requested but RAJA is not enabled!");
#endif
}
/// CUDA backend.
#ifdef MFEM_USE_CUDA

template <typename BODY> __global__ static
void CuKernel1D(const int N, BODY body)
{
   const int k = blockDim.x*blockIdx.x + threadIdx.x;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void CuKernel2D(const int N, BODY body)
{
   const int k = blockIdx.x*blockDim.z + threadIdx.z;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void CuKernel3D(const int N, BODY body)
{
   for (int k = blockIdx.x; k < N; k += gridDim.x) { body(k); }
}
template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
void CuWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;
   CuKernel1D<<<GRID,BLCK>>>(N, d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}
template <typename DBODY>
void CuWrap2D(const int N, DBODY &&d_body,
              const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   MFEM_VERIFY(BZ>0, "");
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   CuKernel2D<<<GRID,BLCK>>>(N,d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}
template <typename DBODY>
void CuWrap3D(const int N, DBODY &&d_body,
              const int X, const int Y, const int Z, const int G)
{
   if (N==0) { return; }
   const int GRID = G == 0 ? N : G;
   const dim3 BLCK(X,Y,Z);
   CuKernel3D<<<GRID,BLCK>>>(N,d_body);
   MFEM_GPU_CHECK(cudaGetLastError());
}
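// Example (illustrative, CUDA builds only; not part of the upstream header):
// the CuWrap helpers can be called directly with a device lambda. CuWrap3D
// launches G thread blocks (or N when G == 0) of shape X x Y x Z and
// grid-strides over the N entries. `NE` and `Q1D` are placeholder sizes.
//
//     CuWrap3D(NE, [=] MFEM_DEVICE (int e) { /* per-element work */ },
//              Q1D, Q1D, Q1D, 0);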
template <int Dim>
struct CuWrap;

template <>
struct CuWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      CuWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct CuWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      CuWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct CuWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      CuWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif // MFEM_USE_CUDA
/// HIP backend.
#ifdef MFEM_USE_HIP

template <typename BODY> __global__ static
void HipKernel1D(const int N, BODY body)
{
   const int k = hipBlockDim_x*hipBlockIdx_x + hipThreadIdx_x;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void HipKernel2D(const int N, BODY body)
{
   const int k = hipBlockIdx_x*hipBlockDim_z + hipThreadIdx_z;
   if (k >= N) { return; }
   body(k);
}

template <typename BODY> __global__ static
void HipKernel3D(const int N, BODY body)
{
   for (int k = hipBlockIdx_x; k < N; k += hipGridDim_x) { body(k); }
}
template <const int BLCK = MFEM_HIP_BLOCKS, typename DBODY>
void HipWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;
   hipLaunchKernelGGL(HipKernel1D,GRID,BLCK,0,nullptr,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}
template <typename DBODY>
void HipWrap2D(const int N, DBODY &&d_body,
               const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   hipLaunchKernelGGL(HipKernel2D,GRID,BLCK,0,nullptr,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}
template <typename DBODY>
void HipWrap3D(const int N, DBODY &&d_body,
               const int X, const int Y, const int Z, const int G)
{
   if (N==0) { return; }
   const int GRID = G == 0 ? N : G;
   const dim3 BLCK(X,Y,Z);
   hipLaunchKernelGGL(HipKernel3D,GRID,BLCK,0,nullptr,N,d_body);
   MFEM_GPU_CHECK(hipGetLastError());
}
template <int Dim>
struct HipWrap;

template <>
struct HipWrap<1>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      HipWrap1D<BLCK>(N, d_body);
   }
};

template <>
struct HipWrap<2>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      HipWrap2D(N, d_body, X, Y, Z);
   }
};

template <>
struct HipWrap<3>
{
   template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
   static void run(const int N, DBODY &&d_body,
                   const int X, const int Y, const int Z, const int G)
   {
      HipWrap3D(N, d_body, X, Y, Z, G);
   }
};

#endif // MFEM_USE_HIP
/// The forall kernel body wrapper.
template <const int DIM, typename d_lambda, typename h_lambda>
inline void ForallWrap(const bool use_dev, const int N,
                       d_lambda &&d_body, h_lambda &&h_body,
                       const int X=0, const int Y=0, const int Z=0,
                       const int G=0)
{
   MFEM_CONTRACT_VAR(X);
   MFEM_CONTRACT_VAR(Y);
   MFEM_CONTRACT_VAR(Z);
   MFEM_CONTRACT_VAR(G);
   MFEM_CONTRACT_VAR(d_body);
   if (!use_dev) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
   // If Backend::RAJA_CUDA is allowed, use it
   if (Device::Allows(Backend::RAJA_CUDA))
   { return RajaCuWrap<DIM>::run(N, d_body, X, Y, Z, G); }
#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_HIP)
   // If Backend::RAJA_HIP is allowed, use it
   if (Device::Allows(Backend::RAJA_HIP))
   { return RajaHipWrap<DIM>::run(N, d_body, X, Y, Z, G); }
#endif

#ifdef MFEM_USE_CUDA
   // If Backend::CUDA is allowed, use it
   if (Device::Allows(Backend::CUDA))
   { return CuWrap<DIM>::run(N, d_body, X, Y, Z, G); }
#endif

#ifdef MFEM_USE_HIP
   // If Backend::HIP is allowed, use it
   if (Device::Allows(Backend::HIP))
   { return HipWrap<DIM>::run(N, d_body, X, Y, Z, G); }
#endif

   // If Backend::DEBUG_DEVICE is allowed, use the host fallback
   if (Device::Allows(Backend::DEBUG_DEVICE)) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
   // If Backend::RAJA_OMP is allowed, use it
   if (Device::Allows(Backend::RAJA_OMP)) { return RajaOmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_OPENMP
   // If Backend::OMP is allowed, use it
   if (Device::Allows(Backend::OMP)) { return OmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_RAJA
   // If Backend::RAJA_CPU is allowed, use it
   if (Device::Allows(Backend::RAJA_CPU)) { return RajaSeqWrap(N, h_body); }
#endif

backend_cpu:
   // Fallback: handle Backend::CPU and any remaining allowed backends.
   for (int k = 0; k < N; k++) { h_body(k); }
}
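// Example (illustrative, not from the upstream header): the two-lambda
// overload lets the device and host code paths differ; both lambdas receive
// the flat index. `use_dev`, `n`, `d_buf` and `h_buf` are assumptions
// (e.g. device and host views of the same array).
//
//     ForallWrap<1>(use_dev, n,
//                   [=] MFEM_DEVICE (int i) { d_buf[i] = 1.0; },
//                   [&] (int i) { h_buf[i] = 1.0; });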
template <const int DIM, typename lambda>
inline void ForallWrap(const bool use_dev, const int N, lambda &&body,
                       const int X=0, const int Y=0, const int Z=0,
                       const int G=0)
{
   ForallWrap<DIM>(use_dev, N, body, body, X, Y, Z, G);
}
template<typename lambda>
inline void forall(int N, lambda &&body) { ForallWrap<1>(true, N, body); }

template<typename lambda>
inline void forall_switch(bool use_dev, int N, lambda &&body)
{ ForallWrap<1>(use_dev, N, body); }

template<typename lambda>
inline void forall_2D(int N, int X, int Y, lambda &&body)
{ ForallWrap<2>(true, N, body, X, Y, 1); }

template<typename lambda>
inline void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
{ ForallWrap<2>(true, N, body, X, Y, BZ); }

template<typename lambda>
inline void forall_3D(int N, int X, int Y, int Z, lambda &&body)
{ ForallWrap<3>(true, N, body, X, Y, Z, 0); }

template<typename lambda>
inline void forall_3D_grid(int N, int X, int Y, int Z, int G, lambda &&body)
{ ForallWrap<3>(true, N, body, X, Y, Z, G); }
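// Example (illustrative sketch): mfem::forall and friends are the function
// counterparts of the MFEM_FORALL macros. `d_v` is assumed to be a device
// pointer of length `n`; `NE` and `Q1D` are placeholder sizes.
//
//     forall(n, [=] MFEM_HOST_DEVICE (int i) { d_v[i] = 0.0; });
//
//     forall_2D(NE, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
//     {
//        MFEM_FOREACH_THREAD(qy, y, Q1D)
//        {
//           MFEM_FOREACH_THREAD(qx, x, Q1D) { /* work for element e */ }
//        }
//     });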
// Similar to mfem::forall, but always executes on the CPU, sequentially or
// with OpenMP when HYPRE was built with it.
template<typename lambda>
inline void hypre_forall_cpu(int N, lambda &&body)
{
#ifdef HYPRE_USING_OPENMP
   #pragma omp parallel for HYPRE_SMP_SCHEDULE
#endif
   for (int i = 0; i < N; i++) { body(i); }
}
// Similar to mfem::forall, but always executes on the GPU device that HYPRE
// was configured with at build time.
#if defined(HYPRE_USING_GPU)
template<typename lambda>
inline void hypre_forall_gpu(int N, lambda &&body)
{
#if defined(HYPRE_USING_CUDA)
   CuWrap1D(N, body);
#elif defined(HYPRE_USING_HIP)
   HipWrap1D(N, body);
#else
#error Unknown HYPRE GPU backend!
#endif
}
#endif // HYPRE_USING_GPU
// Similar to mfem::forall, but executes in the memory/execution space that
// HYPRE is configured for: fixed at build time for HYPRE < 2.31.0, and
// queried at runtime for newer GPU-enabled HYPRE builds.
template<typename lambda>
inline void hypre_forall(int N, lambda &&body)
{
#if !defined(HYPRE_USING_GPU)
   hypre_forall_cpu(N, body);
#elif MFEM_HYPRE_VERSION < 23100
   hypre_forall_gpu(N, body);
#else
   // HYPRE_USING_GPU is defined and MFEM_HYPRE_VERSION >= 23100
   if (HypreUsingGPU()) { hypre_forall_gpu(N, body); }
   else                 { hypre_forall_cpu(N, body); }
#endif
}
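// Example (illustrative): hypre_forall is the right loop for data owned by
// hypre objects, since it iterates in HYPRE's memory/execution space.
// `d_data` is assumed to point into such an array.
//
//     hypre_forall(n, [=] MFEM_HOST_DEVICE (int i) { d_data[i] = 0.0; });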
namespace internal
{

/// Device portion of a reduction over a 1D sequence [0, N).
template <class B, class R>
struct reduction_kernel
{
   /// Value type the body and reducer operate on.
   using value_type = typename R::value_type;
   /// Workspace for the intermediate per-block results.
   mutable value_type *work;
   B body;
   R reducer;
   /// Length of the sequence being reduced.
   int N;
   /// How many items each thread handles during the serial phase.
   int items_per_thread;

   constexpr static MFEM_HOST_DEVICE int max_blocksize() { return 256; }

   /// Helper used to bound the block size by a power of two.
   static int block_log2(unsigned N)
   {
#if defined(__GNUC__) or defined(__clang__)
      return N ? (sizeof(unsigned) * 8 - __builtin_clz(N)) : 0;
#elif defined(_MSC_VER)
      return sizeof(unsigned) * 8 - __lzcnt(N);
#else
      // portable fallback: count the number of significant bits
      int res = 0;
      while (N) { N >>= 1; ++res; }
      return res;
#endif
   }

   MFEM_HOST_DEVICE void operator()(int work_idx) const
   {
      MFEM_SHARED value_type buffer[max_blocksize()];
      reducer.SetInitialValue(buffer[MFEM_THREAD_ID(x)]);
      // serial phase: each thread accumulates its strided items into buffer
      for (int idx = 0; idx < items_per_thread; ++idx)
      {
         int i = MFEM_THREAD_ID(x) +
                 (idx + work_idx * items_per_thread) * MFEM_THREAD_SIZE(x);
         if (i >= N) { break; }
         body(i, buffer[MFEM_THREAD_ID(x)]);
      }
      // binary-tree reduction within the thread block
      for (int i = (MFEM_THREAD_SIZE(x) >> 1); i > 0; i >>= 1)
      {
         MFEM_SYNC_THREAD;
         if (MFEM_THREAD_ID(x) < i)
         {
            reducer.Join(buffer[MFEM_THREAD_ID(x)], buffer[MFEM_THREAD_ID(x) + i]);
         }
      }
      if (MFEM_THREAD_ID(x) == 0)
      {
         work[work_idx] = buffer[0];
      }
   }
};

} // namespace internal
/// Performs a 1D reduction on the range [0,N). res holds the initial value
/// and receives the result; body(i, value) accumulates item i into value;
/// reducer provides SetInitialValue() and Join(); workspace is used for the
/// per-block partial results on device runs.
template <class T, class B, class R>
void reduce(int N, T &res, B &&body, const R &reducer, bool use_dev,
            Array<T> &workspace)
{
#if defined(MFEM_USE_HIP) || defined(MFEM_USE_CUDA)
   if (use_dev && Device::Allows(Backend::CUDA_MASK | Backend::HIP_MASK))
   {
      using red_type = internal::reduction_kernel<typename std::decay<B>::type,
            typename std::decay<R>::type>;
      int block_size = std::min<int>(red_type::max_blocksize(),
                                     1ll << red_type::block_log2(N));
      const int num_mp = Device::NumMultiprocessors(Device::GetId());
#if defined(MFEM_USE_CUDA)
      constexpr int mp_sat = 8;
#elif defined(MFEM_USE_HIP)
      constexpr int mp_sat = 4;
#else
      constexpr int mp_sat = 1;
#endif
      // determine how many items each thread sums during the serial phase
      int nblocks = std::min(mp_sat * num_mp, (N + block_size - 1) / block_size);
      int items_per_thread =
         (N + block_size * nblocks - 1) / (block_size * nblocks);

      red_type red{nullptr, std::forward<B>(body), reducer, N, items_per_thread};
      // place the per-block results in host-accessible (pinned) memory
      auto mt = workspace.GetMemory().GetMemoryType();
      if (!IsHostMemory(mt)) { mt = MemoryType::HOST_PINNED; }
      workspace.SetSize(nblocks, mt);
      auto work = workspace.HostWrite();
      red.work = work;
      forall_2D(nblocks, block_size, 1, std::move(red));
      // wait for the kernel before reading the partial results on the host
      MFEM_DEVICE_SYNC;
      for (int i = 0; i < nblocks; ++i)
      {
         reducer.Join(res, work[i]);
      }
      return;
   }
#endif
   // host fallback: serial accumulation
   for (int i = 0; i < N; ++i)
   {
      body(i, res);
   }
}
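// Example (illustrative sketch, not from the upstream header): reduce() needs
// a reducer exposing value_type, SetInitialValue() and Join(), as used by
// internal::reduction_kernel above. A hypothetical sum reducer and its use
// (`d_x` is an assumed device pointer of length `n`):
//
//     struct SumReducer // illustration only, not an MFEM type
//     {
//        using value_type = double;
//        static MFEM_HOST_DEVICE void SetInitialValue(double &v) { v = 0.0; }
//        static MFEM_HOST_DEVICE void Join(double &a, const double &b) { a += b; }
//     };
//
//     double sum = 0.0;
//     Array<double> wksp;
//     reduce(n, sum,
//            [=] MFEM_HOST_DEVICE (int i, double &v) { v += d_x[i]; },
//            SumReducer{}, Device::Allows(Backend::DEVICE_MASK), wksp);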