#ifndef MFEM_ELASTICITY_KERNEL_HELPERS_HPP
#define MFEM_ELASTICITY_KERNEL_HELPERS_HPP

#include "mfem.hpp"
#include "general/forall.hpp"
#include "linalg/tensor.hpp"

using mfem::internal::tensor;

namespace mfem
{

namespace KernelHelpers
{
// MFEM_SHARED_3D_BLOCK_TENSOR: declare a block-wide shared-memory tensor and
// zero-initialize the slice owned by the current thread. Outside of CUDA
// device code it falls back to a plain, zero-initialized local tensor.
#if defined(__CUDA_ARCH__)
#define MFEM_SHARED_3D_BLOCK_TENSOR(name,T,bx,by,bz,...)\
MFEM_SHARED tensor<T,bx,by,bz,__VA_ARGS__> name;\
name(threadIdx.x, threadIdx.y, threadIdx.z) = tensor<T,__VA_ARGS__> {};
#else
#define MFEM_SHARED_3D_BLOCK_TENSOR(name,...) tensor<__VA_ARGS__> name {};
#endif
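// Usage sketch (illustrative only; the tensor name and template parameters are
// hypothetical, not from this header): inside a kernel launched with a 3D
// thread block one might write
//
//    MFEM_SHARED_3D_BLOCK_TENSOR(dUdxi, double, q1d, q1d, q1d, dim, dim);
//
// which declares the shared tensor once per block and zeroes the entry
// belonging to the calling thread.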
/// @brief Runtime check for memory restrictions that are determined at
/// compile time.
///
/// @param d1d Number of degrees of freedom in 1D.
/// @param q1d Number of quadrature points in 1D.
inline void CheckMemoryRestriction(int d1d, int q1d)
{
   MFEM_VERIFY(d1d <= q1d,
               "There should be more or equal quadrature points "
               "than degrees of freedom in 1D.");
   MFEM_VERIFY(d1d <= MAX_D1D,
               "Maximum number of degrees of freedom in 1D reached. "
               "This number can be increased globally in general/forall.hpp if "
               "device memory allows.");
   MFEM_VERIFY(q1d <= MAX_Q1D,
               "Maximum quadrature points 1D reached. "
               "This number can be increased globally in "
               "general/forall.hpp if device memory allows.");
}
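// Usage sketch (illustrative): kernels templated on d1d/q1d typically call
// this once on the host before launching, e.g.
//
//    KernelHelpers::CheckMemoryRestriction(d1d, q1d);
//
// so that an unsupported basis/quadrature combination fails with a clear error
// instead of overflowing the statically sized scratch buffers.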
/// @brief Multi-component gradient evaluation from DOFs to quadrature points
/// in reference coordinates, accumulating into @a dUdxi.
template <int dim, int d1d, int q1d>
static inline MFEM_HOST_DEVICE void
CalcGrad(const tensor<double, q1d, d1d> &B,
         const tensor<double, q1d, d1d> &G,
         tensor<double, 2, 3, q1d, q1d, q1d> &smem,
         const DeviceTensor<4, const double> &U,
         tensor<double, q1d, q1d, q1d, dim, dim> &dUdxi)
{
   for (int c = 0; c < dim; ++c)
   {
      // Load the c-th component of the element DOFs into shared memory.
      MFEM_FOREACH_THREAD(dz,z,d1d)
      {
         MFEM_FOREACH_THREAD(dy,y,d1d)
         {
            MFEM_FOREACH_THREAD(dx,x,d1d)
            {
               smem(0,0,dx,dy,dz) = U(dx,dy,dz,c);
            }
         }
      }
      MFEM_SYNC_THREAD;
      // Contract with the 1D basis in the x-direction.
      MFEM_FOREACH_THREAD(dz,z,d1d)
      {
         MFEM_FOREACH_THREAD(dy,y,d1d)
         {
            MFEM_FOREACH_THREAD(qx,x,q1d)
            {
               double u = 0.0, v = 0.0;
               for (int dx = 0; dx < d1d; ++dx)
               {
                  const double input = smem(0,0,dx,dy,dz);
                  u += input * B(qx,dx);
                  v += input * G(qx,dx);
               }
               smem(0,1,dz,dy,qx) = u;
               smem(0,2,dz,dy,qx) = v;
            }
         }
      }
      MFEM_SYNC_THREAD;
      // Contract with the 1D basis in the y-direction.
      MFEM_FOREACH_THREAD(dz,z,d1d)
      {
         MFEM_FOREACH_THREAD(qy,y,q1d)
         {
            MFEM_FOREACH_THREAD(qx,x,q1d)
            {
               double u = 0.0, v = 0.0, w = 0.0;
               for (int dy = 0; dy < d1d; ++dy)
               {
                  u += smem(0,2,dz,dy,qx) * B(qy,dy);
                  v += smem(0,1,dz,dy,qx) * G(qy,dy);
                  w += smem(0,1,dz,dy,qx) * B(qy,dy);
               }
               smem(1,0,dz,qy,qx) = u;
               smem(1,1,dz,qy,qx) = v;
               smem(1,2,dz,qy,qx) = w;
            }
         }
      }
      MFEM_SYNC_THREAD;
      // Contract in the z-direction and accumulate the reference gradient.
      MFEM_FOREACH_THREAD(qz,z,q1d)
      {
         MFEM_FOREACH_THREAD(qy,y,q1d)
         {
            MFEM_FOREACH_THREAD(qx,x,q1d)
            {
               double u = 0.0, v = 0.0, w = 0.0;
               for (int dz = 0; dz < d1d; ++dz)
               {
                  u += smem(1,0,dz,qy,qx) * B(qz,dz);
                  v += smem(1,1,dz,qy,qx) * B(qz,dz);
                  w += smem(1,2,dz,qy,qx) * G(qz,dz);
               }
               dUdxi(qz,qy,qx,c,0) += u;
               dUdxi(qz,qy,qx,c,1) += v;
               dUdxi(qz,qy,qx,c,2) += w;
            }
         }
      }
      MFEM_SYNC_THREAD;
   } // vdim
}
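// Usage sketch (illustrative, with hypothetical names): inside an element
// kernel launched over a q1d x q1d x q1d thread block, the reference-space
// gradient at every quadrature point can be accumulated as
//
//    MFEM_SHARED_3D_BLOCK_TENSOR(dUdxi, double, q1d, q1d, q1d, dim, dim);
//    KernelHelpers::CalcGrad<dim, d1d, q1d>(B, G, smem, U_el, dUdxi);
//
// where U_el is a DeviceTensor<4, const double> view of the element DOFs
// (d1d x d1d x d1d x dim) and smem is the shared (2,3,q1d,q1d,q1d) scratchpad.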
/// @brief Multi-component transpose gradient evaluation from quadrature points
/// back to DOFs in reference coordinates, accumulating into @a F.
template <int dim, int d1d, int q1d>
static inline MFEM_HOST_DEVICE void
CalcGradTSum(const tensor<double, q1d, d1d> &B,
             const tensor<double, q1d, d1d> &G,
             tensor<double, 2, 3, q1d, q1d, q1d> &smem,
             const tensor<double, q1d, q1d, q1d, dim, dim> &U,
             DeviceTensor<4, double> &F)
{
   for (int c = 0; c < dim; ++c)
   {
      // Contract with the 1D basis in the x-direction.
      MFEM_FOREACH_THREAD(qz, z, q1d)
      {
         MFEM_FOREACH_THREAD(qy, y, q1d)
         {
            MFEM_FOREACH_THREAD(dx, x, d1d)
            {
               double u = 0.0, v = 0.0, w = 0.0;
               for (int qx = 0; qx < q1d; ++qx)
               {
                  u += U(qx, qy, qz, 0, c) * G(qx, dx);
                  v += U(qx, qy, qz, 1, c) * B(qx, dx);
                  w += U(qx, qy, qz, 2, c) * B(qx, dx);
               }
               smem(0, 0, qz, qy, dx) = u;
               smem(0, 1, qz, qy, dx) = v;
               smem(0, 2, qz, qy, dx) = w;
            }
         }
      }
      MFEM_SYNC_THREAD;
      // Contract with the 1D basis in the y-direction.
      MFEM_FOREACH_THREAD(qz, z, q1d)
      {
         MFEM_FOREACH_THREAD(dy, y, d1d)
         {
            MFEM_FOREACH_THREAD(dx, x, d1d)
            {
               double u = 0.0, v = 0.0, w = 0.0;
               for (int qy = 0; qy < q1d; ++qy)
               {
                  u += smem(0, 0, qz, qy, dx) * B(qy, dy);
                  v += smem(0, 1, qz, qy, dx) * G(qy, dy);
                  w += smem(0, 2, qz, qy, dx) * B(qy, dy);
               }
               smem(1, 0, qz, dy, dx) = u;
               smem(1, 1, qz, dy, dx) = v;
               smem(1, 2, qz, dy, dx) = w;
            }
         }
      }
      MFEM_SYNC_THREAD;
      // Contract in the z-direction and accumulate into the DOF vector.
      MFEM_FOREACH_THREAD(dz, z, d1d)
      {
         MFEM_FOREACH_THREAD(dy, y, d1d)
         {
            MFEM_FOREACH_THREAD(dx, x, d1d)
            {
               double u = 0.0, v = 0.0, w = 0.0;
               for (int qz = 0; qz < q1d; ++qz)
               {
                  u += smem(1, 0, qz, dy, dx) * B(qz, dz);
                  v += smem(1, 1, qz, dy, dx) * B(qz, dz);
                  w += smem(1, 2, qz, dy, dx) * G(qz, dz);
               }
               const double sum = u + v + w;
               F(dx, dy, dz, c) += sum;
            }
         }
      }
      MFEM_SYNC_THREAD;
   } // vdim
}
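// Usage sketch (illustrative, with hypothetical names): after evaluating a
// stress-like quantity sigma at all quadrature points, its action on the test
// functions can be accumulated back to element DOFs as
//
//    KernelHelpers::CalcGradTSum<dim, d1d, q1d>(B, G, smem, sigma, F_el);
//
// where sigma has shape (q1d,q1d,q1d,dim,dim) and F_el is a
// DeviceTensor<4, double> view of the element residual (d1d x d1d x d1d x dim).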
/// @brief Compute the gradients of all shape functions at the quadrature point
/// (qx, qy, qz), pushed forward to physical coordinates with @a invJ.
template <int dim, int d1d, int q1d>
static inline MFEM_HOST_DEVICE tensor<double, d1d, d1d, d1d, dim>
GradAllShapeFunctions(int qx, int qy, int qz,
                      const tensor<double, q1d, d1d> &B,
                      const tensor<double, q1d, d1d> &G,
                      const tensor<double, dim, dim> &invJ)
{
   tensor<double, d1d, d1d, d1d, dim> dphi_dx;
   // Reference gradients are tensor products of 1D values:
   // G (x) B (x) B,  B (x) G (x) B,  B (x) B (x) G.
   for (int dx = 0; dx < d1d; dx++)
   {
      for (int dy = 0; dy < d1d; dy++)
      {
         for (int dz = 0; dz < d1d; dz++)
         {
            dphi_dx(dx, dy, dz) =
               transpose(invJ) *
               tensor<double, dim> {G(qx, dx) * B(qy, dy) * B(qz, dz),
                                    B(qx, dx) * G(qy, dy) * B(qz, dz),
                                    B(qx, dx) * B(qy, dy) * G(qz, dz)
                                   };
         }
      }
   }
   return dphi_dx;
}
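// Usage sketch (illustrative): with invJqp the inverse Jacobian at a
// quadrature point, the physical-space gradients of every shape function are
// obtained as
//
//    const auto dphi_dx =
//       KernelHelpers::GradAllShapeFunctions<dim, d1d, q1d>(qx, qy, qz, B, G, invJqp);
//
// which is useful when a kernel needs per-shape-function contributions, e.g.
// when assembling a diagonal approximation of the element operator.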
} // namespace KernelHelpers

} // namespace mfem

#endif