21namespace KernelHelpers
28#if defined(__CUDA_ARCH__)
29#define MFEM_SHARED_3D_BLOCK_TENSOR(name,T,bx,by,bz,...)\
30MFEM_SHARED tensor<T,bx,by,bz,__VA_ARGS__> name;\
31name(threadIdx.x, threadIdx.y, threadIdx.z) = tensor<T,__VA_ARGS__> {};
33#define MFEM_SHARED_3D_BLOCK_TENSOR(name,...) tensor<__VA_ARGS__> name {};
47 MFEM_VERIFY(d1d <= q1d,
48 "There should be more or equal quadrature points "
51 "Maximum number of degrees of freedom in 1D reached."
52 "This number can be increased globally in general/forall.hpp if "
53 "device memory allows.");
55 "Maximum quadrature points 1D reached."
56 "This number can be increased globally in "
57 "general/forall.hpp if device memory allows.");
81template <
int dim,
int d1d,
int q1d>
82static inline MFEM_HOST_DEVICE
void
83CalcGrad(
const tensor<real_t, q1d, d1d> &B,
84 const tensor<real_t, q1d, d1d> &G,
85 tensor<real_t,2,3,q1d,q1d,q1d> &smem,
87 tensor<real_t, q1d, q1d, q1d, dim, dim> &dUdxi)
89 for (
int c = 0; c <
dim; ++c)
91 MFEM_FOREACH_THREAD(dz,z,d1d)
93 MFEM_FOREACH_THREAD(dy,y,d1d)
95 MFEM_FOREACH_THREAD(dx,x,d1d)
97 smem(0,0,dx,dy,dz) = U(dx,dy,dz,c);
102 MFEM_FOREACH_THREAD(dz,z,d1d)
104 MFEM_FOREACH_THREAD(dy,y,d1d)
106 MFEM_FOREACH_THREAD(qx,x,q1d)
110 for (
int dx = 0; dx < d1d; ++dx)
112 const real_t input = smem(0,0,dx,dy,dz);
113 u += input * B(qx,dx);
114 v += input * G(qx,dx);
116 smem(0,1,dz,dy,qx) =
u;
117 smem(0,2,dz,dy,qx) = v;
122 MFEM_FOREACH_THREAD(dz,z,d1d)
124 MFEM_FOREACH_THREAD(qy,y,q1d)
126 MFEM_FOREACH_THREAD(qx,x,q1d)
131 for (
int dy = 0; dy < d1d; ++dy)
133 u += smem(0,2,dz,dy,qx) * B(qy,dy);
134 v += smem(0,1,dz,dy,qx) * G(qy,dy);
135 w += smem(0,1,dz,dy,qx) * B(qy,dy);
137 smem(1,0,dz,qy,qx) =
u;
138 smem(1,1,dz,qy,qx) = v;
139 smem(1,2,dz,qy,qx) = w;
144 MFEM_FOREACH_THREAD(qz,z,q1d)
146 MFEM_FOREACH_THREAD(qy,y,q1d)
148 MFEM_FOREACH_THREAD(qx,x,q1d)
153 for (
int dz = 0; dz < d1d; ++dz)
155 u += smem(1,0,dz,qy,qx) * B(qz,dz);
156 v += smem(1,1,dz,qy,qx) * B(qz,dz);
157 w += smem(1,2,dz,qy,qx) * G(qz,dz);
159 dUdxi(qz,qy,qx,c,0) +=
u;
160 dUdxi(qz,qy,qx,c,1) += v;
161 dUdxi(qz,qy,qx,c,2) += w;
187template <
int dim,
int d1d,
int q1d>
188static inline MFEM_HOST_DEVICE
void
189CalcGradTSum(
const tensor<real_t, q1d, d1d> &B,
190 const tensor<real_t, q1d, d1d> &G,
191 tensor<real_t, 2, 3, q1d, q1d, q1d> &smem,
192 const tensor<real_t, q1d, q1d, q1d, dim, dim> &U,
193 DeviceTensor<4, real_t> &F)
195 for (
int c = 0; c <
dim; ++c)
197 MFEM_FOREACH_THREAD(qz, z, q1d)
199 MFEM_FOREACH_THREAD(qy, y, q1d)
201 MFEM_FOREACH_THREAD(dx, x, d1d)
203 real_t u = 0.0, v = 0.0, w = 0.0;
204 for (
int qx = 0; qx < q1d; ++qx)
206 u += U(qx, qy, qz, 0, c) * G(qx, dx);
207 v += U(qx, qy, qz, 1, c) * B(qx, dx);
208 w += U(qx, qy, qz, 2, c) * B(qx, dx);
210 smem(0, 0, qz, qy, dx) =
u;
211 smem(0, 1, qz, qy, dx) = v;
212 smem(0, 2, qz, qy, dx) = w;
217 MFEM_FOREACH_THREAD(qz, z, q1d)
219 MFEM_FOREACH_THREAD(dy, y, d1d)
221 MFEM_FOREACH_THREAD(dx, x, d1d)
223 real_t u = 0.0, v = 0.0, w = 0.0;
224 for (
int qy = 0; qy < q1d; ++qy)
226 u += smem(0, 0, qz, qy, dx) * B(qy, dy);
227 v += smem(0, 1, qz, qy, dx) * G(qy, dy);
228 w += smem(0, 2, qz, qy, dx) * B(qy, dy);
230 smem(1, 0, qz, dy, dx) =
u;
231 smem(1, 1, qz, dy, dx) = v;
232 smem(1, 2, qz, dy, dx) = w;
237 MFEM_FOREACH_THREAD(dz, z, d1d)
239 MFEM_FOREACH_THREAD(dy, y, d1d)
241 MFEM_FOREACH_THREAD(dx, x, d1d)
243 real_t u = 0.0, v = 0.0, w = 0.0;
244 for (
int qz = 0; qz < q1d; ++qz)
246 u += smem(1, 0, qz, dy, dx) * B(qz, dz);
247 v += smem(1, 1, qz, dy, dx) * B(qz, dz);
248 w += smem(1, 2, qz, dy, dx) * G(qz, dz);
251 F(dx, dy, dz, c) += sum;
280template <
int dim,
int d1d,
int q1d>
281static inline MFEM_HOST_DEVICE tensor<real_t, d1d, d1d, d1d, dim>
282GradAllShapeFunctions(
int qx,
int qy,
int qz,
283 const tensor<real_t, q1d, d1d> &B,
284 const tensor<real_t, q1d, d1d> &G,
285 const tensor<real_t, dim, dim> &invJ)
287 tensor<real_t, d1d, d1d, d1d, dim> dphi_dx;
291 for (
int dx = 0; dx < d1d; dx++)
293 for (
int dy = 0; dy < d1d; dy++)
295 for (
int dz = 0; dz < d1d; dz++)
299 tensor<real_t, dim> {G(qx, dx) * B(qy, dy) * B(qz, dz),
300 B(qx, dx) * G(qy, dy) * B(qz, dz),
301 B(qx, dx) * B(qy, dy) * G(qz, dz)