12#ifndef MFEM_BILININTEG_MASS_KERNELS_HPP
13#define MFEM_BILININTEG_MASS_KERNELS_HPP
31static void PAMassAssembleDiagonal1D(
const int NE,
32 const Array<real_t> &
b,
38 auto B =
Reshape(
b.Read(), Q1D, D1D);
39 auto D =
Reshape(d.Read(), Q1D, NE);
40 auto Y =
Reshape(y.ReadWrite(), D1D, NE);
43 for (
int dx = 0; dx < D1D; ++dx)
45 for (
int qx = 0; qx < Q1D; ++qx)
47 Y(dx, e) += B(qx, dx) * B(qx, dx) * D(qx, e);
53MFEM_HOST_DEVICE
inline
54void PAMassApply1D_Element(
const int e,
72 real_t XQ[DofQuadLimits::MAX_Q1D];
73 for (
int qx = 0; qx < Q1D; ++qx)
77 for (
int dx = 0; dx < D1D; ++dx)
80 for (
int qx = 0; qx < Q1D; ++qx)
85 for (
int qx = 0; qx < Q1D; ++qx)
87 const double q = XQ[qx]*D(qx,e);
88 for (
int dx = 0; dx < D1D; ++dx)
90 Y(dx,e) += Bt(dx,qx) * q;
96static void PAMassApply1D(
const int NE,
97 const Array<real_t> &b_,
98 const Array<real_t> &bt_,
108 const auto B = b_.Read();
109 const auto Bt = bt_.Read();
110 const auto D = d_.Read();
111 const auto X = x_.Read();
112 auto Y = y_.ReadWrite();
116 internal::PAMassApply1D_Element(e, NE, B, Bt, D, X, Y, d1d, q1d);
121template<
int T_D1D = 0,
int T_Q1D = 0>
122inline void PAMassAssembleDiagonal2D(
const int NE,
123 const Array<real_t> &
b,
129 const int D1D = T_D1D ? T_D1D : d1d;
130 const int Q1D = T_Q1D ? T_Q1D : q1d;
133 auto B =
Reshape(
b.Read(), Q1D, D1D);
134 auto D =
Reshape(d.Read(), Q1D, Q1D, NE);
135 auto Y =
Reshape(y.ReadWrite(), D1D, D1D, NE);
138 const int D1D = T_D1D ? T_D1D : d1d;
139 const int Q1D = T_Q1D ? T_Q1D : q1d;
140 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
141 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
143 for (
int qx = 0; qx < Q1D; ++qx)
145 for (
int dy = 0; dy < D1D; ++dy)
148 for (
int qy = 0; qy < Q1D; ++qy)
150 QD[qx][dy] += B(qy, dy) * B(qy, dy) * D(qx, qy, e);
154 for (
int dy = 0; dy < D1D; ++dy)
156 for (
int dx = 0; dx < D1D; ++dx)
158 for (
int qx = 0; qx < Q1D; ++qx)
160 Y(dx,dy,e) += B(qx, dx) * B(qx, dx) * QD[qx][dy];
169constexpr int ipow(
int x,
int p) {
return p == 0 ? 1 : x*ipow(x,
p-1); }
170constexpr int D(
int D1D) {
return (11 - D1D) / 2; }
171constexpr int NBZ(
int D1D)
173 return ipow(2, D(D1D) >= 0 ? D(D1D) : 0);
178template<
int T_D1D = 0,
int T_Q1D = 0>
179inline void SmemPAMassAssembleDiagonal2D(
const int NE,
180 const Array<real_t> &b_,
186 static constexpr int T_NBZ = mass::NBZ(T_D1D);
187 static constexpr int NBZ = T_NBZ ? T_NBZ : 1;
188 const int D1D = T_D1D ? T_D1D : d1d;
189 const int Q1D = T_Q1D ? T_Q1D : q1d;
192 MFEM_VERIFY(D1D <= max_d1d,
"");
193 MFEM_VERIFY(Q1D <= max_q1d,
"");
194 auto b =
Reshape(b_.Read(), Q1D, D1D);
195 auto D =
Reshape(d_.Read(), Q1D, Q1D, NE);
196 auto Y =
Reshape(y_.ReadWrite(), D1D, D1D, NE);
199 const int tidz = MFEM_THREAD_ID(z);
200 const int D1D = T_D1D ? T_D1D : d1d;
201 const int Q1D = T_Q1D ? T_Q1D : q1d;
202 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
203 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
204 MFEM_SHARED
real_t B[MQ1][MD1];
205 MFEM_SHARED
real_t QDZ[NBZ][MQ1][MD1];
209 MFEM_FOREACH_THREAD(d,y,D1D)
211 MFEM_FOREACH_THREAD(q,x,Q1D)
218 MFEM_FOREACH_THREAD(qx,x,Q1D)
220 MFEM_FOREACH_THREAD(dy,y,D1D)
223 for (
int qy = 0; qy < Q1D; ++qy)
225 QD[qx][dy] += B[qy][dy] * B[qy][dy] * D(qx, qy, e);
230 MFEM_FOREACH_THREAD(dy,y,D1D)
232 MFEM_FOREACH_THREAD(dx,x,D1D)
234 for (
int qx = 0; qx < Q1D; ++qx)
237 Y(dx,dy,e) += B[qx][dx] * B[qx][dx] * QD[qx][dy];
245template<
int T_D1D = 0,
int T_Q1D = 0>
246inline void PAMassAssembleDiagonal3D(
const int NE,
247 const Array<real_t> &
b,
253 const int D1D = T_D1D ? T_D1D : d1d;
254 const int Q1D = T_Q1D ? T_Q1D : q1d;
257 auto B =
Reshape(
b.Read(), Q1D, D1D);
258 auto D =
Reshape(d.Read(), Q1D, Q1D, Q1D, NE);
259 auto Y =
Reshape(y.ReadWrite(), D1D, D1D, D1D, NE);
262 const int D1D = T_D1D ? T_D1D : d1d;
263 const int Q1D = T_Q1D ? T_Q1D : q1d;
264 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
265 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
266 real_t QQD[MQ1][MQ1][MD1];
267 real_t QDD[MQ1][MD1][MD1];
268 for (
int qx = 0; qx < Q1D; ++qx)
270 for (
int qy = 0; qy < Q1D; ++qy)
272 for (
int dz = 0; dz < D1D; ++dz)
274 QQD[qx][qy][dz] = 0.0;
275 for (
int qz = 0; qz < Q1D; ++qz)
277 QQD[qx][qy][dz] += B(qz, dz) * B(qz, dz) * D(qx, qy, qz, e);
282 for (
int qx = 0; qx < Q1D; ++qx)
284 for (
int dz = 0; dz < D1D; ++dz)
286 for (
int dy = 0; dy < D1D; ++dy)
288 QDD[qx][dy][dz] = 0.0;
289 for (
int qy = 0; qy < Q1D; ++qy)
291 QDD[qx][dy][dz] += B(qy, dy) * B(qy, dy) * QQD[qx][qy][dz];
296 for (
int dz = 0; dz < D1D; ++dz)
298 for (
int dy = 0; dy < D1D; ++dy)
300 for (
int dx = 0; dx < D1D; ++dx)
303 for (
int qx = 0; qx < Q1D; ++qx)
305 t += B(qx, dx) * B(qx, dx) * QDD[qx][dy][dz];
307 Y(dx, dy, dz, e) += t;
315template<
int T_D1D = 0,
int T_Q1D = 0>
316inline void SmemPAMassAssembleDiagonal3D(
const int NE,
317 const Array<real_t> &b_,
323 const int D1D = T_D1D ? T_D1D : d1d;
324 const int Q1D = T_Q1D ? T_Q1D : q1d;
327 MFEM_VERIFY(D1D <= max_d1d,
"");
328 MFEM_VERIFY(Q1D <= max_q1d,
"");
329 auto b =
Reshape(b_.Read(), Q1D, D1D);
330 auto D =
Reshape(d_.Read(), Q1D, Q1D, Q1D, NE);
331 auto Y =
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
334 const int tidz = MFEM_THREAD_ID(z);
335 const int D1D = T_D1D ? T_D1D : d1d;
336 const int Q1D = T_Q1D ? T_Q1D : q1d;
337 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
338 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
339 MFEM_SHARED
real_t B[MQ1][MD1];
340 MFEM_SHARED
real_t QQD[MQ1][MQ1][MD1];
341 MFEM_SHARED
real_t QDD[MQ1][MD1][MD1];
344 MFEM_FOREACH_THREAD(d,y,D1D)
346 MFEM_FOREACH_THREAD(q,x,Q1D)
353 MFEM_FOREACH_THREAD(qx,x,Q1D)
355 MFEM_FOREACH_THREAD(qy,y,Q1D)
357 MFEM_FOREACH_THREAD(dz,z,D1D)
359 QQD[qx][qy][dz] = 0.0;
360 for (
int qz = 0; qz < Q1D; ++qz)
362 QQD[qx][qy][dz] += B[qz][dz] * B[qz][dz] * D(qx, qy, qz, e);
368 MFEM_FOREACH_THREAD(qx,x,Q1D)
370 MFEM_FOREACH_THREAD(dz,z,D1D)
372 MFEM_FOREACH_THREAD(dy,y,D1D)
374 QDD[qx][dy][dz] = 0.0;
375 for (
int qy = 0; qy < Q1D; ++qy)
377 QDD[qx][dy][dz] += B[qy][dy] * B[qy][dy] * QQD[qx][qy][dz];
383 MFEM_FOREACH_THREAD(dz,z,D1D)
385 MFEM_FOREACH_THREAD(dy,y,D1D)
387 MFEM_FOREACH_THREAD(dx,x,D1D)
390 for (
int qx = 0; qx < Q1D; ++qx)
392 t += B[qx][dx] * B[qx][dx] * QDD[qx][dy][dz];
394 Y(dx, dy, dz, e) += t;
403void OccaPAMassApply2D(
const int D1D,
406 const Array<real_t> &B,
407 const Array<real_t> &Bt,
413void OccaPAMassApply3D(
const int D1D,
416 const Array<real_t> &B,
417 const Array<real_t> &Bt,
423template <
bool ACCUMULATE = true>
424MFEM_HOST_DEVICE
inline
425void PAMassApply2D_Element(
const int e,
445 for (
int dy = 0; dy < D1D; ++dy)
447 for (
int dx = 0; dx < D1D; ++dx)
454 constexpr int max_D1D = DofQuadLimits::MAX_D1D;
455 constexpr int max_Q1D = DofQuadLimits::MAX_Q1D;
456 real_t sol_xy[max_Q1D][max_Q1D];
457 for (
int qy = 0; qy < Q1D; ++qy)
459 for (
int qx = 0; qx < Q1D; ++qx)
461 sol_xy[qy][qx] = 0.0;
464 for (
int dy = 0; dy < D1D; ++dy)
467 for (
int qy = 0; qy < Q1D; ++qy)
471 for (
int dx = 0; dx < D1D; ++dx)
473 const real_t s = X(dx,dy,e);
474 for (
int qx = 0; qx < Q1D; ++qx)
476 sol_x[qx] += B(qx,dx)* s;
479 for (
int qy = 0; qy < Q1D; ++qy)
481 const real_t d2q = B(qy,dy);
482 for (
int qx = 0; qx < Q1D; ++qx)
484 sol_xy[qy][qx] += d2q * sol_x[qx];
488 for (
int qy = 0; qy < Q1D; ++qy)
490 for (
int qx = 0; qx < Q1D; ++qx)
492 sol_xy[qy][qx] *= D(qx,qy,e);
495 for (
int qy = 0; qy < Q1D; ++qy)
498 for (
int dx = 0; dx < D1D; ++dx)
502 for (
int qx = 0; qx < Q1D; ++qx)
504 const real_t s = sol_xy[qy][qx];
505 for (
int dx = 0; dx < D1D; ++dx)
507 sol_x[dx] += Bt(dx,qx) * s;
510 for (
int dy = 0; dy < D1D; ++dy)
512 const real_t q2d = Bt(dy,qy);
513 for (
int dx = 0; dx < D1D; ++dx)
515 Y(dx,dy,e) += q2d * sol_x[dx];
521template<
int T_D1D,
int T_Q1D,
int T_NBZ,
bool ACCUMULATE = true>
522MFEM_HOST_DEVICE
inline
523void SmemPAMassApply2D_Element(
const int e,
532 const int D1D = T_D1D ? T_D1D : d1d;
533 const int Q1D = T_Q1D ? T_Q1D : q1d;
534 constexpr int NBZ = T_NBZ ? T_NBZ : 1;
536 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
537 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
538 constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
545 const int tidz = MFEM_THREAD_ID(z);
547 MFEM_SHARED
real_t BBt[MQ1*MD1];
550 MFEM_SHARED
real_t sm0[NBZ][MDQ*MDQ];
551 MFEM_SHARED
real_t sm1[NBZ][MDQ*MDQ];
558 MFEM_FOREACH_THREAD(dy,y,D1D)
560 MFEM_FOREACH_THREAD(dx,x,D1D)
562 X[dy][dx] = x(dx,dy,e);
567 MFEM_FOREACH_THREAD(dy,y,D1D)
569 MFEM_FOREACH_THREAD(q,x,Q1D)
576 MFEM_FOREACH_THREAD(dy,y,D1D)
578 MFEM_FOREACH_THREAD(qx,x,Q1D)
581 for (
int dx = 0; dx < D1D; ++dx)
583 dq += X[dy][dx] * B[qx][dx];
589 MFEM_FOREACH_THREAD(qy,y,Q1D)
591 MFEM_FOREACH_THREAD(qx,x,Q1D)
594 for (
int dy = 0; dy < D1D; ++dy)
596 qq += DQ[dy][qx] * B[qy][dy];
598 QQ[qy][qx] = qq * D(qx, qy, e);
604 MFEM_FOREACH_THREAD(dy,y,D1D)
606 MFEM_FOREACH_THREAD(q,x,Q1D)
613 MFEM_FOREACH_THREAD(qy,y,Q1D)
615 MFEM_FOREACH_THREAD(dx,x,D1D)
618 for (
int qx = 0; qx < Q1D; ++qx)
620 dq += QQ[qy][qx] * Bt[dx][qx];
626 MFEM_FOREACH_THREAD(dy,y,D1D)
628 MFEM_FOREACH_THREAD(dx,x,D1D)
631 for (
int qy = 0; qy < Q1D; ++qy)
633 dd += (QD[qy][dx] * Bt[dy][qy]);
647template <
bool ACCUMULATE = true>
648MFEM_HOST_DEVICE
inline
649void PAMassApply3D_Element(
const int e,
663 auto D = DeviceTensor<4,const real_t>(d_, Q1D, Q1D, Q1D, NE);
664 auto X = DeviceTensor<4,const real_t>(x_, D1D, D1D, D1D, NE);
665 auto Y = DeviceTensor<4,real_t>(y_, D1D, D1D, D1D, NE);
669 for (
int dz = 0; dz < D1D; ++dz)
671 for (
int dy = 0; dy < D1D; ++dy)
673 for (
int dx = 0; dx < D1D; ++dx)
675 Y(dx, dy, dz, e) = 0.0;
681 constexpr int max_D1D = DofQuadLimits::MAX_D1D;
682 constexpr int max_Q1D = DofQuadLimits::MAX_Q1D;
683 real_t sol_xyz[max_Q1D][max_Q1D][max_Q1D];
684 for (
int qz = 0; qz < Q1D; ++qz)
686 for (
int qy = 0; qy < Q1D; ++qy)
688 for (
int qx = 0; qx < Q1D; ++qx)
690 sol_xyz[qz][qy][qx] = 0.0;
694 for (
int dz = 0; dz < D1D; ++dz)
696 real_t sol_xy[max_Q1D][max_Q1D];
697 for (
int qy = 0; qy < Q1D; ++qy)
699 for (
int qx = 0; qx < Q1D; ++qx)
701 sol_xy[qy][qx] = 0.0;
704 for (
int dy = 0; dy < D1D; ++dy)
707 for (
int qx = 0; qx < Q1D; ++qx)
711 for (
int dx = 0; dx < D1D; ++dx)
713 const real_t s = X(dx,dy,dz,e);
714 for (
int qx = 0; qx < Q1D; ++qx)
716 sol_x[qx] += B(qx,dx) * s;
719 for (
int qy = 0; qy < Q1D; ++qy)
721 const real_t wy = B(qy,dy);
722 for (
int qx = 0; qx < Q1D; ++qx)
724 sol_xy[qy][qx] += wy * sol_x[qx];
728 for (
int qz = 0; qz < Q1D; ++qz)
730 const real_t wz = B(qz,dz);
731 for (
int qy = 0; qy < Q1D; ++qy)
733 for (
int qx = 0; qx < Q1D; ++qx)
735 sol_xyz[qz][qy][qx] += wz * sol_xy[qy][qx];
740 for (
int qz = 0; qz < Q1D; ++qz)
742 for (
int qy = 0; qy < Q1D; ++qy)
744 for (
int qx = 0; qx < Q1D; ++qx)
746 sol_xyz[qz][qy][qx] *= D(qx,qy,qz,e);
750 for (
int qz = 0; qz < Q1D; ++qz)
752 real_t sol_xy[max_D1D][max_D1D];
753 for (
int dy = 0; dy < D1D; ++dy)
755 for (
int dx = 0; dx < D1D; ++dx)
760 for (
int qy = 0; qy < Q1D; ++qy)
763 for (
int dx = 0; dx < D1D; ++dx)
767 for (
int qx = 0; qx < Q1D; ++qx)
769 const real_t s = sol_xyz[qz][qy][qx];
770 for (
int dx = 0; dx < D1D; ++dx)
772 sol_x[dx] += Bt(dx,qx) * s;
775 for (
int dy = 0; dy < D1D; ++dy)
777 const real_t wy = Bt(dy,qy);
778 for (
int dx = 0; dx < D1D; ++dx)
780 sol_xy[dy][dx] += wy * sol_x[dx];
784 for (
int dz = 0; dz < D1D; ++dz)
786 const real_t wz = Bt(dz,qz);
787 for (
int dy = 0; dy < D1D; ++dy)
789 for (
int dx = 0; dx < D1D; ++dx)
791 Y(dx,dy,dz,e) += wz * sol_xy[dy][dx];
798template<
int T_D1D,
int T_Q1D,
bool ACCUMULATE = true>
799MFEM_HOST_DEVICE
inline
800void SmemPAMassApply3D_Element(
const int e,
809 constexpr int D1D = T_D1D ? T_D1D : d1d;
810 constexpr int Q1D = T_Q1D ? T_Q1D : q1d;
811 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
812 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
813 constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
816 auto d = DeviceTensor<4,const real_t>(d_, Q1D, Q1D, Q1D, NE);
817 auto x = DeviceTensor<4,const real_t>(x_, D1D, D1D, D1D, NE);
818 auto y = DeviceTensor<4,real_t>(y_, D1D, D1D, D1D, NE);
820 MFEM_SHARED
real_t sDQ[MQ1*MD1];
823 MFEM_SHARED
real_t sm0[MDQ*MDQ*MDQ];
824 MFEM_SHARED
real_t sm1[MDQ*MDQ*MDQ];
831 MFEM_FOREACH_THREAD(dy,y,D1D)
833 MFEM_FOREACH_THREAD(dx,x,D1D)
836 for (
int dz = 0; dz < D1D; ++dz)
838 X[dz][dy][dx] = x(dx,dy,dz,e);
841 MFEM_FOREACH_THREAD(dx,x,Q1D)
843 B[dx][dy] =
b(dx,dy);
847 MFEM_FOREACH_THREAD(dy,y,D1D)
849 MFEM_FOREACH_THREAD(qx,x,Q1D)
853 for (
int dz = 0; dz < D1D; dz++)
858 for (
int dx = 0; dx < D1D; ++dx)
861 for (
int dz = 0; dz < D1D; ++dz)
863 u[dz] += X[dz][dy][dx] * B[qx][dx];
867 for (
int dz = 0; dz < D1D; ++dz)
869 DDQ[dz][dy][qx] =
u[dz];
874 MFEM_FOREACH_THREAD(qy,y,Q1D)
876 MFEM_FOREACH_THREAD(qx,x,Q1D)
880 for (
int dz = 0; dz < D1D; dz++)
885 for (
int dy = 0; dy < D1D; ++dy)
888 for (
int dz = 0; dz < D1D; dz++)
890 u[dz] += DDQ[dz][dy][qx] * B[qy][dy];
894 for (
int dz = 0; dz < D1D; dz++)
896 DQQ[dz][qy][qx] =
u[dz];
901 MFEM_FOREACH_THREAD(qy,y,Q1D)
903 MFEM_FOREACH_THREAD(qx,x,Q1D)
907 for (
int qz = 0; qz < Q1D; qz++)
912 for (
int dz = 0; dz < D1D; ++dz)
915 for (
int qz = 0; qz < Q1D; qz++)
917 u[qz] += DQQ[dz][qy][qx] * B[qz][dz];
921 for (
int qz = 0; qz < Q1D; qz++)
923 QQQ[qz][qy][qx] =
u[qz] * d(qx,qy,qz,e);
928 MFEM_FOREACH_THREAD(di,y,D1D)
930 MFEM_FOREACH_THREAD(q,x,Q1D)
936 MFEM_FOREACH_THREAD(qy,y,Q1D)
938 MFEM_FOREACH_THREAD(dx,x,D1D)
942 for (
int qz = 0; qz < Q1D; ++qz)
947 for (
int qx = 0; qx < Q1D; ++qx)
950 for (
int qz = 0; qz < Q1D; ++qz)
952 u[qz] += QQQ[qz][qy][qx] * Bt[dx][qx];
956 for (
int qz = 0; qz < Q1D; ++qz)
958 QQD[qz][qy][dx] =
u[qz];
963 MFEM_FOREACH_THREAD(dy,y,D1D)
965 MFEM_FOREACH_THREAD(dx,x,D1D)
969 for (
int qz = 0; qz < Q1D; ++qz)
974 for (
int qy = 0; qy < Q1D; ++qy)
977 for (
int qz = 0; qz < Q1D; ++qz)
979 u[qz] += QQD[qz][qy][dx] * Bt[dy][qy];
983 for (
int qz = 0; qz < Q1D; ++qz)
985 QDD[qz][dy][dx] =
u[qz];
990 MFEM_FOREACH_THREAD(dy,y,D1D)
992 MFEM_FOREACH_THREAD(dx,x,D1D)
996 for (
int dz = 0; dz < D1D; ++dz)
1001 for (
int qz = 0; qz < Q1D; ++qz)
1004 for (
int dz = 0; dz < D1D; ++dz)
1006 u[dz] += QDD[qz][dy][dx] * Bt[dz][qz];
1010 for (
int dz = 0; dz < D1D; ++dz)
1014 y(dx,dy,dz,e) +=
u[dz];
1018 y(dx,dy,dz,e) =
u[dz];
1027template<
int T_D1D = 0,
int T_Q1D = 0>
1028inline void PAMassApply2D(
const int NE,
1029 const Array<real_t> &b_,
1030 const Array<real_t> &bt_,
1037 MFEM_VERIFY(T_D1D ? T_D1D : d1d <= DeviceDofQuadLimits::Get().MAX_D1D,
"");
1038 MFEM_VERIFY(T_Q1D ? T_Q1D : q1d <= DeviceDofQuadLimits::Get().MAX_Q1D,
"");
1040 const auto B = b_.Read();
1041 const auto Bt = bt_.Read();
1042 const auto D = d_.Read();
1043 const auto X = x_.Read();
1044 auto Y = y_.ReadWrite();
1048 internal::PAMassApply2D_Element(e, NE, B, Bt, D, X, Y, d1d, q1d);
1053template<
int T_D1D = 0,
int T_Q1D = 0>
1054inline void SmemPAMassApply2D(
const int NE,
1055 const Array<real_t> &b_,
1056 const Array<real_t> &bt_,
1063 MFEM_CONTRACT_VAR(bt_);
1064 static constexpr int T_NBZ = mass::NBZ(T_D1D);
1065 static constexpr int NBZ = T_NBZ ? T_NBZ : 1;
1066 const int D1D = T_D1D ? T_D1D : d1d;
1067 const int Q1D = T_Q1D ? T_Q1D : q1d;
1070 MFEM_VERIFY(D1D <= max_d1d,
"");
1071 MFEM_VERIFY(Q1D <= max_q1d,
"");
1072 const auto b = b_.Read();
1073 const auto D = d_.Read();
1074 const auto x = x_.Read();
1075 auto Y = y_.ReadWrite();
1078 internal::SmemPAMassApply2D_Element<T_D1D,T_Q1D,T_NBZ>(
1079 e, NE,
b, D, x, Y, d1d, q1d);
1084template<
int T_D1D = 0,
int T_Q1D = 0>
1085inline void PAMassApply3D(
const int NE,
1086 const Array<real_t> &b_,
1087 const Array<real_t> &bt_,
1094 MFEM_VERIFY(T_D1D ? T_D1D : d1d <= DeviceDofQuadLimits::Get().MAX_D1D,
"");
1095 MFEM_VERIFY(T_Q1D ? T_Q1D : q1d <= DeviceDofQuadLimits::Get().MAX_Q1D,
"");
1097 const auto B = b_.Read();
1098 const auto Bt = bt_.Read();
1099 const auto D = d_.Read();
1100 const auto X = x_.Read();
1101 auto Y = y_.ReadWrite();
1105 internal::PAMassApply3D_Element(e, NE, B, Bt, D, X, Y, d1d, q1d);
1110template<
int T_D1D = 0,
int T_Q1D = 0>
1111inline void SmemPAMassApply3D(
const int NE,
1112 const Array<real_t> &b_,
1113 const Array<real_t> &bt_,
1120 MFEM_CONTRACT_VAR(bt_);
1121 const int D1D = T_D1D ? T_D1D : d1d;
1122 const int Q1D = T_Q1D ? T_Q1D : q1d;
1125 MFEM_VERIFY(D1D <= max_d1d,
"");
1126 MFEM_VERIFY(Q1D <= max_q1d,
"");
1130 auto y = y_.ReadWrite();
1133 internal::SmemPAMassApply3D_Element<T_D1D,T_Q1D>(e, NE,
b, d, x, y, d1d, q1d);
1137template<
int T_D1D = 0,
int T_Q1D = 0>
1138inline void EAMassAssemble1D(
const int NE,
1139 const Array<real_t> &basis,
1140 const Vector &padata,
1146 const int D1D = T_D1D ? T_D1D : d1d;
1147 const int Q1D = T_Q1D ? T_Q1D : q1d;
1150 auto B =
Reshape(basis.Read(), Q1D, D1D);
1151 auto D =
Reshape(padata.Read(), Q1D, NE);
1152 auto M =
Reshape(
add ? eadata.ReadWrite() : eadata.
Write(), D1D, D1D, NE);
1155 const int D1D = T_D1D ? T_D1D : d1d;
1156 const int Q1D = T_Q1D ? T_Q1D : q1d;
1157 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
1158 MFEM_FOREACH_THREAD(i1,x,D1D)
1161 for (
int q = 0; q < Q1D; q++) { r_Bi[q] = B(q,i1); }
1162 MFEM_FOREACH_THREAD(j1,y,D1D)
1165 for (
int q = 0; q < Q1D; q++) { r_Bj[q] = B(q,j1); }
1168 for (
int k1 = 0; k1 < Q1D; ++k1)
1170 val += r_Bi[k1] * r_Bj[k1] * D(k1, e);
1174 M(i1, j1, e) += val;
1185template<
int T_D1D = 0,
int T_Q1D = 0>
1186inline void EAMassAssemble2D(
const int NE,
1187 const Array<real_t> &basis,
1188 const Vector &padata,
1194 const int D1D = T_D1D ? T_D1D : d1d;
1195 const int Q1D = T_Q1D ? T_Q1D : q1d;
1198 auto B =
Reshape(basis.Read(), Q1D, D1D);
1199 auto D =
Reshape(padata.Read(), Q1D, Q1D, NE);
1200 auto M =
Reshape(
add ? eadata.ReadWrite() : eadata.
Write(), D1D, D1D, D1D, D1D,
1204 const int D1D = T_D1D ? T_D1D : d1d;
1205 const int Q1D = T_Q1D ? T_Q1D : q1d;
1206 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
1207 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
1209 for (
int d = 0; d < D1D; d++)
1211 for (
int q = 0; q < Q1D; q++)
1216 MFEM_SHARED
real_t s_D[MQ1][MQ1];
1217 MFEM_FOREACH_THREAD(k1,x,Q1D)
1219 MFEM_FOREACH_THREAD(k2,y,Q1D)
1221 s_D[k1][k2] = D(k1,k2,e);
1225 MFEM_FOREACH_THREAD(i1,x,D1D)
1227 MFEM_FOREACH_THREAD(i2,y,D1D)
1229 for (
int j1 = 0; j1 < D1D; ++j1)
1231 for (
int j2 = 0; j2 < D1D; ++j2)
1234 for (
int k1 = 0; k1 < Q1D; ++k1)
1236 for (
int k2 = 0; k2 < Q1D; ++k2)
1238 val += r_B[k1][i1] * r_B[k1][j1]
1239 * r_B[k2][i2] * r_B[k2][j2]
1245 M(i1, i2, j1, j2, e) += val;
1249 M(i1, i2, j1, j2, e) = val;
1258template<
int T_D1D = 0,
int T_Q1D = 0>
1259inline void EAMassAssemble3D(
const int NE,
1260 const Array<real_t> &basis,
1261 const Vector &padata,
1267 const int D1D = T_D1D ? T_D1D : d1d;
1268 const int Q1D = T_Q1D ? T_Q1D : q1d;
1271 auto B =
Reshape(basis.Read(), Q1D, D1D);
1272 auto D =
Reshape(padata.Read(), Q1D, Q1D, Q1D, NE);
1273 auto M =
Reshape(
add ? eadata.ReadWrite() : eadata.
Write(), D1D, D1D, D1D, D1D,
1277 const int D1D = T_D1D ? T_D1D : d1d;
1278 const int Q1D = T_Q1D ? T_Q1D : q1d;
1279 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
1280 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
1281 constexpr int DQ = T_D1D * T_Q1D;
1285 constexpr bool USE_REG = DQ != 0 && DQ <= 12;
1286 constexpr int MD1r = USE_REG ? MD1 : 1;
1287 constexpr int MQ1r = USE_REG ? MQ1 : 1;
1288 constexpr int MD1s = USE_REG ? 1 : MD1;
1289 constexpr int MQ1s = USE_REG ? 1 : MQ1;
1291 MFEM_SHARED
real_t s_B[MQ1s][MD1s];
1293 real_t (*l_B)[MD1] =
nullptr;
1296 for (
int d = 0; d < D1D; d++)
1298 for (
int q = 0; q < Q1D; q++)
1303 l_B = (
real_t (*)[MD1])r_B;
1307 if (MFEM_THREAD_ID(z) == 0)
1309 MFEM_FOREACH_THREAD(d,x,D1D)
1311 MFEM_FOREACH_THREAD(q,y,Q1D)
1317 l_B = (
real_t (*)[MD1])s_B;
1320 MFEM_SHARED
real_t s_D[MQ1][MQ1][MQ1];
1321 MFEM_FOREACH_THREAD(k1,x,Q1D)
1323 MFEM_FOREACH_THREAD(k2,y,Q1D)
1325 MFEM_FOREACH_THREAD(k3,z,Q1D)
1327 s_D[k1][k2][k3] = D(k1,k2,k3,e);
1332 MFEM_FOREACH_THREAD(i1,x,D1D)
1334 MFEM_FOREACH_THREAD(i2,y,D1D)
1336 MFEM_FOREACH_THREAD(i3,z,D1D)
1338 for (
int j1 = 0; j1 < D1D; ++j1)
1340 for (
int j2 = 0; j2 < D1D; ++j2)
1342 for (
int j3 = 0; j3 < D1D; ++j3)
1345 for (
int k1 = 0; k1 < Q1D; ++k1)
1347 for (
int k2 = 0; k2 < Q1D; ++k2)
1349 for (
int k3 = 0; k3 < Q1D; ++k3)
1351 val += l_B[k1][i1] * l_B[k1][j1]
1352 * l_B[k2][i2] * l_B[k2][j2]
1353 * l_B[k3][i3] * l_B[k3][j3]
1360 M(i1, i2, i3, j1, j2, j3, e) += val;
1364 M(i1, i2, i3, j1, j2, j3, e) = val;
1383template<
int DIM,
int T_D1D,
int T_Q1D>
1384ApplyKernelType MassIntegrator::ApplyPAKernels::Kernel()
1386 if (
DIM == 1) {
return internal::PAMassApply1D; }
1387 else if (
DIM == 2) {
return internal::SmemPAMassApply2D<T_D1D,T_Q1D>; }
1388 else if (
DIM == 3) {
return internal::SmemPAMassApply3D<T_D1D, T_Q1D>; }
1389 else { MFEM_ABORT(
""); }
1392inline ApplyKernelType MassIntegrator::ApplyPAKernels::Fallback(
1395 if (
DIM == 1) {
return internal::PAMassApply1D; }
1396 else if (
DIM == 2) {
return internal::PAMassApply2D; }
1397 else if (
DIM == 3) {
return internal::PAMassApply3D; }
1398 else { MFEM_ABORT(
""); }
1401template<
int DIM,
int T_D1D,
int T_Q1D>
1402DiagonalKernelType MassIntegrator::DiagonalPAKernels::Kernel()
1404 if (
DIM == 1) {
return internal::PAMassAssembleDiagonal1D; }
1405 else if (
DIM == 2) {
return internal::SmemPAMassAssembleDiagonal2D<T_D1D,T_Q1D>; }
1406 else if (
DIM == 3) {
return internal::SmemPAMassAssembleDiagonal3D<T_D1D, T_Q1D>; }
1407 else { MFEM_ABORT(
""); }
1410inline DiagonalKernelType MassIntegrator::DiagonalPAKernels::Fallback(
1413 if (
DIM == 1) {
return internal::PAMassAssembleDiagonal1D; }
1414 else if (
DIM == 2) {
return internal::PAMassAssembleDiagonal2D; }
1415 else if (
DIM == 3) {
return internal::PAMassAssembleDiagonal3D; }
1416 else { MFEM_ABORT(
""); }
void(*)(const int, const Array< real_t > &, const Array< real_t > &, const Vector &, const Vector &, Vector &, const int, const int) ApplyKernelType
void(*)(const int, const Array< real_t > &, const Vector &, Vector &, const int, const int) DiagonalKernelType
real_t u(const Vector &xvec)
DeviceTensor< 2, real_t > DeviceMatrix
T * Write(Memory< T > &mem, int size, bool on_dev=true)
Get a pointer for write access to mem with the mfem::Device's DeviceMemoryClass, if on_dev = true,...
void add(const Vector &v1, const Vector &v2, Vector &v)
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
DeviceTensor< 2, const real_t > ConstDeviceMatrix
DeviceTensor< 3, const real_t > ConstDeviceCube
void forall_2D(int N, int X, int Y, lambda &&body)
void forall_3D(int N, int X, int Y, int Z, lambda &&body)
DeviceTensor< 3, real_t > DeviceCube
void forall(int N, lambda &&body)
real_t p(const Vector &x, real_t t)
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.
int MAX_D1D
Maximum number of 1D nodal points.
int MAX_Q1D
Maximum number of 1D quadrature points.