12#ifndef MFEM_DGMASSINV_KERNELS_HPP
13#define MFEM_DGMASSINV_KERNELS_HPP
25template <
int DIM,
int D1D,
int Q1D>
26MFEM_HOST_DEVICE
inline
27void DGMassApply(
const int e,
37 constexpr bool use_smem = (D1D > 0 && Q1D > 0);
38 constexpr bool ACCUM =
false;
39 constexpr int NBZ = 1;
43 PAMassApply1D_Element<ACCUM>(e, NE, B, Bt, pa_data, x, y, d1d, q1d);
51 constexpr int TD1D = D1D ? D1D : 1;
52 constexpr int TQ1D = Q1D ? Q1D : 1;
55 SmemPAMassApply2D_Element<TD1D,TQ1D,NBZ,ACCUM>(e, NE, B, pa_data, x, y);
59 SmemPAMassApply3D_Element<TD1D,TQ1D,ACCUM>(e, NE, B, pa_data, x, y);
63 MFEM_ABORT_KERNEL(
"Unsupported dimension.");
70 PAMassApply2D_Element<ACCUM>(e, NE, B, Bt, pa_data, x, y, d1d, q1d);
74 PAMassApply3D_Element<ACCUM>(e, NE, B, Bt, pa_data, x, y, d1d, q1d);
78 MFEM_ABORT_KERNEL(
"Unsupported dimension.");
83MFEM_HOST_DEVICE
inline
84void DGMassPreconditioner(
const int e,
95 const int tid = MFEM_THREAD_ID(x) + MFEM_THREAD_SIZE(x)*MFEM_THREAD_ID(y);
96 const int bxy = MFEM_THREAD_SIZE(x)*MFEM_THREAD_SIZE(y);
98 for (
int i = tid; i < ND; i += bxy)
100 Y(i, e) = D(i, e)*X(i, e);
105MFEM_HOST_DEVICE
inline
106void DGMassAxpy(
const int e,
119 const int tid = MFEM_THREAD_ID(x) + MFEM_THREAD_SIZE(x)*MFEM_THREAD_ID(y);
120 const int bxy = MFEM_THREAD_SIZE(x)*MFEM_THREAD_SIZE(y);
122 for (
int i = tid; i < ND; i += bxy)
124 Z(i, e) =
a*X(i, e) +
b*Y(i, e);
130MFEM_HOST_DEVICE
inline
131real_t DGMassDot(
const int e,
140 const int tid = MFEM_THREAD_ID(x) + MFEM_THREAD_SIZE(x)*MFEM_THREAD_ID(y);
141 const int bxy = MFEM_THREAD_SIZE(x)*MFEM_THREAD_SIZE(y);
143 MFEM_SHARED
real_t s_dot[NB*NB];
146 for (
int i = tid; i < ND; i += bxy) { s_dot[tid] += X(i,e)*Y(i,e); }
149 if (bxy > 512 && tid + 512 < bxy) { s_dot[tid] += s_dot[tid + 512]; }
152 if (bxy > 256 && tid < 256 && tid + 256 < bxy) { s_dot[tid] += s_dot[tid + 256]; }
155 if (bxy > 128 && tid < 128 && tid + 128 < bxy) { s_dot[tid] += s_dot[tid + 128]; }
158 if (bxy > 64 && tid < 64 && tid + 64 < bxy) { s_dot[tid] += s_dot[tid + 64]; }
161 if (bxy > 32 && tid < 32 && tid + 32 < bxy) { s_dot[tid] += s_dot[tid + 32]; }
164 if (bxy > 16 && tid < 16 && tid + 16 < bxy) { s_dot[tid] += s_dot[tid + 16]; }
167 if (bxy > 8 && tid < 8 && tid + 8 < bxy) { s_dot[tid] += s_dot[tid + 8]; }
170 if (bxy > 4 && tid < 4 && tid + 4 < bxy) { s_dot[tid] += s_dot[tid + 4]; }
173 if (bxy > 2 && tid < 2 && tid + 2 < bxy) { s_dot[tid] += s_dot[tid + 2]; }
176 if (bxy > 1 && tid < 1 && tid + 1 < bxy) { s_dot[tid] += s_dot[tid + 1]; }
182template<
int T_D1D = 0>
183MFEM_HOST_DEVICE
inline
184void DGMassBasis1D(
const int e,
191 const int D1D = T_D1D ? T_D1D : d1d;
193 const auto b =
Reshape(b_, D1D, D1D);
194 const auto x =
Reshape(x_, D1D, NE);
197 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
200 MFEM_FOREACH_THREAD(i,x,D1D)
203 for (
int j = 0; j < D1D; ++j)
205 val +=
b(i,j)*x(j,e);
210 if (MFEM_THREAD_ID(y) == 0)
212 MFEM_FOREACH_THREAD(i,x,D1D)
219template<
int T_D1D = 0>
220MFEM_HOST_DEVICE
inline
221void DGMassBasis2D(
const int e,
228 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
229 const int D1D = T_D1D ? T_D1D : d1d;
231 const auto b =
Reshape(b_, D1D, D1D);
232 const auto x =
Reshape(x_, D1D, D1D, NE);
233 auto y =
Reshape(y_, D1D, D1D, NE);
235 MFEM_SHARED
real_t sB[MD1*MD1];
236 MFEM_SHARED
real_t sm0[MD1*MD1];
237 MFEM_SHARED
real_t sm1[MD1*MD1];
239 kernels::internal::LoadB<MD1,MD1>(D1D,D1D,
b,sB);
246 kernels::internal::LoadX(e,D1D,x,DD);
247 kernels::internal::EvalX(D1D,D1D,B,DD,DQ);
248 kernels::internal::EvalY(D1D,D1D,B,DQ,QQ);
250 MFEM_FOREACH_THREAD(qy,y,D1D)
252 MFEM_FOREACH_THREAD(qx,x,D1D)
254 y(qx,qy,e) = QQ(qx,qy);
260template<
int T_D1D = 0>
261MFEM_HOST_DEVICE
inline
262void DGMassBasis3D(
const int e,
269 const int D1D = T_D1D ? T_D1D : d1d;
271 const auto b =
Reshape(b_, D1D, D1D);
272 const auto x =
Reshape(x_, D1D, D1D, D1D, NE);
273 auto y =
Reshape(y_, D1D, D1D, D1D, NE);
275 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
277 MFEM_SHARED
real_t sB[MD1*MD1];
278 MFEM_SHARED
real_t sm0[MD1*MD1*MD1];
279 MFEM_SHARED
real_t sm1[MD1*MD1*MD1];
281 kernels::internal::LoadB<MD1,MD1>(D1D,D1D,
b,sB);
289 kernels::internal::LoadX(e,D1D,x,DDD);
290 kernels::internal::EvalX(D1D,D1D,B,DDD,DDQ);
291 kernels::internal::EvalY(D1D,D1D,B,DDQ,DQQ);
292 kernels::internal::EvalZ(D1D,D1D,B,DQQ,QQQ);
294 MFEM_FOREACH_THREAD(qz,z,D1D)
296 MFEM_FOREACH_THREAD(qy,y,D1D)
298 for (
int qx = 0; qx < D1D; ++qx)
300 y(qx,qy,qz,e) = QQQ(qz,qy,qx);
307template<
int DIM,
int T_D1D = 0>
308MFEM_HOST_DEVICE
inline
309void DGMassBasis(
const int e,
318 DGMassBasis1D<T_D1D>(e, NE, b_, x_, y_, d1d);
322 DGMassBasis2D<T_D1D>(e, NE, b_, x_, y_, d1d);
326 DGMassBasis3D<T_D1D>(e, NE, b_, x_, y_, d1d);
330 MFEM_ABORT_KERNEL(
"Dimension not supported.");
DeviceTensor< 3, real_t > DeviceCube
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
DeviceTensor< 2, const real_t > ConstDeviceMatrix
DeviceTensor< 2, real_t > DeviceMatrix