MFEM v4.0
Finite element discretization library
forall.hpp
// Copyright (c) 2010, Lawrence Livermore National Security, LLC. Produced at
// the Lawrence Livermore National Laboratory. LLNL-CODE-443211. All Rights
// reserved. See file COPYRIGHT for details.
//
// This file is part of the MFEM library. For more information and source code
// availability see http://mfem.org.
//
// MFEM is free software; you can redistribute it and/or modify it under the
// terms of the GNU Lesser General Public License (as published by the Free
// Software Foundation) version 2.1 dated February 1999.

#ifndef MFEM_FORALL_HPP
#define MFEM_FORALL_HPP

#include "../config/config.hpp"
#include "error.hpp"
#include "cuda.hpp"
#include "occa.hpp"
#include "device.hpp"
#include "mem_manager.hpp"
#include "../linalg/dtensor.hpp"

#ifdef MFEM_USE_RAJA
#include "RAJA/RAJA.hpp"
#if defined(RAJA_ENABLE_CUDA) && !defined(MFEM_USE_CUDA)
#error When RAJA is built with CUDA, MFEM_USE_CUDA=YES is required
#endif
#endif

namespace mfem
{

// Maximum size of dofs and quads in 1D.
const int MAX_D1D = 16;
const int MAX_Q1D = 16;

// Implementation of MFEM's "parallel for" (forall) device/host kernel
// interfaces supporting RAJA, CUDA, OpenMP, and sequential backends.

// The MFEM_FORALL wrapper
#define MFEM_FORALL(i,N,...) \
   ForallWrap<1>(true,N, \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] (int i) {__VA_ARGS__})
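
// Illustrative use of MFEM_FORALL (a sketch, not part of the original header):
// the body is captured both as a device lambda and as a host lambda, so it
// should only touch data valid on the selected backend, e.g. raw pointers
// obtained from Vector::Read()/ReadWrite():
//
//    const double *d_x = x.Read();   // device (or host) pointer to x
//    double *d_y = y.ReadWrite();    // device (or host) pointer to y
//    MFEM_FORALL(i, N, d_y[i] += a * d_x[i];);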

// MFEM_FORALL with a 2D CUDA block
#define MFEM_FORALL_2D(i,N,X,Y,BZ,...) \
   ForallWrap<2>(true,N, \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] (int i) {__VA_ARGS__}, \
                 X,Y,BZ)

// MFEM_FORALL with a 3D CUDA block
#define MFEM_FORALL_3D(i,N,X,Y,Z,...) \
   ForallWrap<3>(true,N, \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] (int i) {__VA_ARGS__}, \
                 X,Y,Z)
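
// Note on the 2D/3D variants: on a CUDA device the 3D form gives each index i
// its own X x Y x Z thread block, while the 2D form packs BZ consecutive
// indices into one block of X x Y x BZ threads; in both cases all threads
// assigned to i invoke the body for that i, whereas on the host the body runs
// once per i. A hedged sketch (NE and Q1D are placeholders, not defined here):
//
//    MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D,
//    {
//       // body of a per-element kernel that uses the block's threads
//    });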

// MFEM_FORALL that uses the basic CPU backend when use_dev is false. See, for
// example, the functions in vector.cpp, where we don't want to use the mfem
// device for operations on small vectors.
#define MFEM_FORALL_SWITCH(use_dev,i,N,...) \
   ForallWrap<1>(use_dev,N, \
                 [=] MFEM_DEVICE (int i) {__VA_ARGS__}, \
                 [&] (int i) {__VA_ARGS__})
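
// Illustrative use of MFEM_FORALL_SWITCH (a sketch; in vector.cpp 'use_dev'
// typically comes from a flag such as Vector::UseDevice()):
//
//    const bool use_dev = v.UseDevice(); // false forces the plain CPU loop
//    MFEM_FORALL_SWITCH(use_dev, i, N, d_v[i] = 0.0;);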


/// OpenMP backend
template <typename HBODY>
void OmpWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_OPENMP
   #pragma omp parallel for
   for (int k = 0; k < N; k++)
   {
      h_body(k);
   }
#else
   MFEM_ABORT("OpenMP requested for MFEM but OpenMP is not enabled!");
#endif
}


/// RAJA Cuda backend
template <int BLOCKS, typename DBODY>
void RajaCudaWrap(const int N, DBODY &&d_body)
{
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
   RAJA::forall<RAJA::cuda_exec<BLOCKS>>(RAJA::RangeSegment(0,N),d_body);
#else
   MFEM_ABORT("RAJA::Cuda requested but RAJA::Cuda is not enabled!");
#endif
}


/// RAJA OpenMP backend
template <typename HBODY>
void RajaOmpWrap(const int N, HBODY &&h_body)
{
#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
   RAJA::forall<RAJA::omp_parallel_for_exec>(RAJA::RangeSegment(0,N), h_body);
#else
   MFEM_ABORT("RAJA::OpenMP requested but RAJA::OpenMP is not enabled!");
#endif
}


/// RAJA sequential loop backend
template <typename HBODY>
void RajaSeqWrap(const int N, HBODY &&h_body)
{
#ifdef MFEM_USE_RAJA
   RAJA::forall<RAJA::loop_exec>(RAJA::RangeSegment(0,N), h_body);
#else
   MFEM_ABORT("RAJA requested but RAJA is not enabled!");
#endif
}


/// CUDA backend
#ifdef MFEM_USE_CUDA

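// 1D kernel: one CUDA thread per index k in [0,N).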
template <typename BODY> __global__ static
void CuKernel1D(const int N, BODY body)
{
   const int k = blockDim.x*blockIdx.x + threadIdx.x;
   if (k >= N) { return; }
   body(k);
}

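// 2D kernel: each block covers BZ consecutive indices, one per threadIdx.z
// slab, and the blockDim.x x blockDim.y threads of a slab share the same k.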
template <typename BODY> __global__ static
void CuKernel2D(const int N, BODY body, const int BZ)
{
   const int k = blockIdx.x*BZ + threadIdx.z;
   if (k >= N) { return; }
   body(k);
}

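// 3D kernel: one block per index k; every thread of the block calls body(k).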
template <typename BODY> __global__ static
void CuKernel3D(const int N, BODY body)
{
   const int k = blockIdx.x;
   if (k >= N) { return; }
   body(k);
}

template <const int BLCK = MFEM_CUDA_BLOCKS, typename DBODY>
void CuWrap1D(const int N, DBODY &&d_body)
{
   if (N==0) { return; }
   const int GRID = (N+BLCK-1)/BLCK;
   CuKernel1D<<<GRID,BLCK>>>(N, d_body);
   MFEM_CUDA_CHECK(cudaGetLastError());
}

template <typename DBODY>
void CuWrap2D(const int N, DBODY &&d_body,
              const int X, const int Y, const int BZ)
{
   if (N==0) { return; }
   const int GRID = (N+BZ-1)/BZ;
   const dim3 BLCK(X,Y,BZ);
   CuKernel2D<<<GRID,BLCK>>>(N,d_body,BZ);
   MFEM_CUDA_CHECK(cudaGetLastError());
}

template <typename DBODY>
void CuWrap3D(const int N, DBODY &&d_body,
              const int X, const int Y, const int Z)
{
   if (N==0) { return; }
   const int GRID = N;
   const dim3 BLCK(X,Y,Z);
   CuKernel3D<<<GRID,BLCK>>>(N,d_body);
   MFEM_CUDA_CHECK(cudaGetLastError());
}
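
// Launch configurations used above: CuWrap1D runs (N+BLCK-1)/BLCK blocks of
// BLCK threads, CuWrap2D runs (N+BZ-1)/BZ blocks of X*Y*BZ threads, and
// CuWrap3D runs N blocks of X*Y*Z threads. For example (illustrative numbers
// only), N = 1000 with X = Y = Z = 8 launches 1000 blocks of 512 threads each.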

#endif // MFEM_USE_CUDA


/// The forall kernel body wrapper
template <const int DIM, typename DBODY, typename HBODY>
inline void ForallWrap(const bool use_dev, const int N,
                       DBODY &&d_body, HBODY &&h_body,
                       const int X=0, const int Y=0, const int Z=0)
{
   if (!use_dev) { goto backend_cpu; }

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_CUDA)
   // Handle all allowed CUDA backends except Backend::CUDA
   if (DIM == 1 && Device::Allows(Backend::CUDA_MASK & ~Backend::CUDA))
   { return RajaCudaWrap<MFEM_CUDA_BLOCKS>(N, d_body); }
#endif

#ifdef MFEM_USE_CUDA
   // Handle all allowed CUDA backends
   if (DIM == 1 && Device::Allows(Backend::CUDA_MASK))
   { return CuWrap1D(N, d_body); }

   if (DIM == 2 && Device::Allows(Backend::CUDA_MASK))
   { return CuWrap2D(N, d_body, X, Y, Z); }

   if (DIM == 3 && Device::Allows(Backend::CUDA_MASK))
   { return CuWrap3D(N, d_body, X, Y, Z); }
#endif

#if defined(MFEM_USE_RAJA) && defined(RAJA_ENABLE_OPENMP)
   // Handle all allowed OpenMP backends except Backend::OMP
   if (Device::Allows(Backend::OMP_MASK & ~Backend::OMP))
   { return RajaOmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_OPENMP
   // Handle all allowed OpenMP backends
   if (Device::Allows(Backend::OMP_MASK)) { return OmpWrap(N, h_body); }
#endif

#ifdef MFEM_USE_RAJA
   // Handle all allowed CPU backends except Backend::CPU
   if (Device::Allows(Backend::CPU_MASK & ~Backend::CPU))
   { return RajaSeqWrap(N, h_body); }
#endif

backend_cpu:
   // Handle Backend::CPU. This is also a fallback for any allowed backends not
   // handled above, e.g. OCCA_CPU with configuration 'occa-cpu,cpu', or
   // OCCA_OMP with configuration 'occa-omp,cpu'.
   for (int k = 0; k < N; k++) { h_body(k); }
}
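
// Illustrative dispatch (a sketch, assuming the device has been configured
// elsewhere, e.g. via mfem::Device in an application's main()):
//
//    Device device("cuda");             // allows Backend::CUDA
//    MFEM_FORALL(i, N, d_y[i] = 0.0;);  // ForallWrap<1> -> CuWrap1D
//
//    Device device("omp");              // allows Backend::OMP
//    MFEM_FORALL(i, N, d_y[i] = 0.0;);  // ForallWrap<1> -> OmpWrap
//
//    Device device("cpu");              // allows only Backend::CPU
//    MFEM_FORALL(i, N, d_y[i] = 0.0;);  // falls through to the backend_cpu loop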

} // namespace mfem

#endif // MFEM_FORALL_HPP