24namespace KernelHelpers
29#if defined(__CUDA_ARCH__)
30#define MFEM_SHARED_3D_BLOCK_TENSOR(name,T,bx,by,bz,...)\
31MFEM_SHARED tensor<T,bx,by,bz,__VA_ARGS__> name;\
32name(threadIdx.x, threadIdx.y, threadIdx.z) = tensor<T,__VA_ARGS__> {};
34#define MFEM_SHARED_3D_BLOCK_TENSOR(name,...) tensor<__VA_ARGS__> name {};
48 MFEM_VERIFY(d1d <= q1d,
49 "There should be more or equal quadrature points "
52 "Maximum number of degrees of freedom in 1D reached."
53 "This number can be increased globally in general/forall.hpp if "
54 "device memory allows.");
56 "Maximum quadrature points 1D reached."
57 "This number can be increased globally in "
58 "general/forall.hpp if device memory allows.");
82template <
int dim,
int d1d,
int q1d>
83static inline MFEM_HOST_DEVICE
void
84CalcGrad(
const tensor<real_t, q1d, d1d> &B,
85 const tensor<real_t, q1d, d1d> &G,
86 tensor<real_t,2,3,q1d,q1d,q1d> &smem,
88 tensor<real_t, q1d, q1d, q1d, dim, dim> &dUdxi)
90 for (
int c = 0; c <
dim; ++c)
92 MFEM_FOREACH_THREAD(dz,z,d1d)
94 MFEM_FOREACH_THREAD(dy,y,d1d)
96 MFEM_FOREACH_THREAD(dx,x,d1d)
98 smem(0,0,dx,dy,dz) = U(dx,dy,dz,c);
103 MFEM_FOREACH_THREAD(dz,z,d1d)
105 MFEM_FOREACH_THREAD(dy,y,d1d)
107 MFEM_FOREACH_THREAD(qx,x,q1d)
111 for (
int dx = 0; dx < d1d; ++dx)
113 const real_t input = smem(0,0,dx,dy,dz);
114 u += input * B(qx,dx);
115 v += input * G(qx,dx);
117 smem(0,1,dz,dy,qx) =
u;
118 smem(0,2,dz,dy,qx) = v;
123 MFEM_FOREACH_THREAD(dz,z,d1d)
125 MFEM_FOREACH_THREAD(qy,y,q1d)
127 MFEM_FOREACH_THREAD(qx,x,q1d)
132 for (
int dy = 0; dy < d1d; ++dy)
134 u += smem(0,2,dz,dy,qx) * B(qy,dy);
135 v += smem(0,1,dz,dy,qx) * G(qy,dy);
136 w += smem(0,1,dz,dy,qx) * B(qy,dy);
138 smem(1,0,dz,qy,qx) =
u;
139 smem(1,1,dz,qy,qx) = v;
140 smem(1,2,dz,qy,qx) = w;
145 MFEM_FOREACH_THREAD(qz,z,q1d)
147 MFEM_FOREACH_THREAD(qy,y,q1d)
149 MFEM_FOREACH_THREAD(qx,x,q1d)
154 for (
int dz = 0; dz < d1d; ++dz)
156 u += smem(1,0,dz,qy,qx) * B(qz,dz);
157 v += smem(1,1,dz,qy,qx) * B(qz,dz);
158 w += smem(1,2,dz,qy,qx) * G(qz,dz);
160 dUdxi(qz,qy,qx,c,0) +=
u;
161 dUdxi(qz,qy,qx,c,1) += v;
162 dUdxi(qz,qy,qx,c,2) += w;
188template <
int dim,
int d1d,
int q1d>
189static inline MFEM_HOST_DEVICE
void
190CalcGradTSum(
const tensor<real_t, q1d, d1d> &B,
191 const tensor<real_t, q1d, d1d> &G,
192 tensor<real_t, 2, 3, q1d, q1d, q1d> &smem,
193 const tensor<real_t, q1d, q1d, q1d, dim, dim> &U,
194 DeviceTensor<4, real_t> &F)
196 for (
int c = 0; c <
dim; ++c)
198 MFEM_FOREACH_THREAD(qz, z, q1d)
200 MFEM_FOREACH_THREAD(qy, y, q1d)
202 MFEM_FOREACH_THREAD(dx, x, d1d)
204 real_t u = 0.0, v = 0.0, w = 0.0;
205 for (
int qx = 0; qx < q1d; ++qx)
207 u += U(qx, qy, qz, 0, c) * G(qx, dx);
208 v += U(qx, qy, qz, 1, c) * B(qx, dx);
209 w += U(qx, qy, qz, 2, c) * B(qx, dx);
211 smem(0, 0, qz, qy, dx) =
u;
212 smem(0, 1, qz, qy, dx) = v;
213 smem(0, 2, qz, qy, dx) = w;
218 MFEM_FOREACH_THREAD(qz, z, q1d)
220 MFEM_FOREACH_THREAD(dy, y, d1d)
222 MFEM_FOREACH_THREAD(dx, x, d1d)
224 real_t u = 0.0, v = 0.0, w = 0.0;
225 for (
int qy = 0; qy < q1d; ++qy)
227 u += smem(0, 0, qz, qy, dx) * B(qy, dy);
228 v += smem(0, 1, qz, qy, dx) * G(qy, dy);
229 w += smem(0, 2, qz, qy, dx) * B(qy, dy);
231 smem(1, 0, qz, dy, dx) =
u;
232 smem(1, 1, qz, dy, dx) = v;
233 smem(1, 2, qz, dy, dx) = w;
238 MFEM_FOREACH_THREAD(dz, z, d1d)
240 MFEM_FOREACH_THREAD(dy, y, d1d)
242 MFEM_FOREACH_THREAD(dx, x, d1d)
244 real_t u = 0.0, v = 0.0, w = 0.0;
245 for (
int qz = 0; qz < q1d; ++qz)
247 u += smem(1, 0, qz, dy, dx) * B(qz, dz);
248 v += smem(1, 1, qz, dy, dx) * B(qz, dz);
249 w += smem(1, 2, qz, dy, dx) * G(qz, dz);
252 F(dx, dy, dz, c) += sum;
281template <
int dim,
int d1d,
int q1d>
282static inline MFEM_HOST_DEVICE tensor<real_t, d1d, d1d, d1d, dim>
283GradAllShapeFunctions(
int qx,
int qy,
int qz,
284 const tensor<real_t, q1d, d1d> &B,
285 const tensor<real_t, q1d, d1d> &G,
286 const tensor<real_t, dim, dim> &invJ)
288 tensor<real_t, d1d, d1d, d1d, dim> dphi_dx;
292 for (
int dx = 0; dx < d1d; dx++)
294 for (
int dy = 0; dy < d1d; dy++)
296 for (
int dz = 0; dz < d1d; dz++)
300 tensor<real_t, dim> {G(qx, dx) * B(qy, dy) * B(qz, dz),
301 B(qx, dx) * G(qy, dy) * B(qz, dz),
302 B(qx, dx) * B(qy, dy) * G(qz, dz)