12#ifndef MFEM_BILININTEG_MASS_KERNELS_HPP
13#define MFEM_BILININTEG_MASS_KERNELS_HPP
31static void PAMassAssembleDiagonal1D(
const int NE,
32 const Array<real_t> &
b,
38 auto B =
Reshape(
b.Read(), Q1D, D1D);
39 auto D =
Reshape(d.Read(), Q1D, NE);
40 auto Y =
Reshape(y.ReadWrite(), D1D, NE);
43 for (
int dx = 0; dx < D1D; ++dx)
45 for (
int qx = 0; qx < Q1D; ++qx)
47 Y(dx, e) += B(qx, dx) * B(qx, dx) * D(qx, e);
53template <
bool ACCUMULATE = true>
54MFEM_HOST_DEVICE
inline
55void PAMassApply1D_Element(
const int e,
75 for (
int dx = 0; dx < D1D; ++dx)
81 real_t XQ[DofQuadLimits::MAX_Q1D];
82 for (
int qx = 0; qx < Q1D; ++qx)
86 for (
int dx = 0; dx < D1D; ++dx)
89 for (
int qx = 0; qx < Q1D; ++qx)
94 for (
int qx = 0; qx < Q1D; ++qx)
96 const double q = XQ[qx]*D(qx,e);
97 for (
int dx = 0; dx < D1D; ++dx)
99 Y(dx,e) += Bt(dx,qx) * q;
105static void PAMassApply1D(
const int NE,
106 const Array<real_t> &b_,
107 const Array<real_t> &bt_,
117 const auto B = b_.Read();
118 const auto Bt = bt_.Read();
119 const auto D = d_.Read();
120 const auto X = x_.Read();
121 auto Y = y_.ReadWrite();
125 internal::PAMassApply1D_Element(e, NE, B, Bt, D, X, Y, d1d, q1d);
130template<
int T_D1D = 0,
int T_Q1D = 0>
131inline void PAMassAssembleDiagonal2D(
const int NE,
132 const Array<real_t> &
b,
138 const int D1D = T_D1D ? T_D1D : d1d;
139 const int Q1D = T_Q1D ? T_Q1D : q1d;
142 auto B =
Reshape(
b.Read(), Q1D, D1D);
143 auto D =
Reshape(d.Read(), Q1D, Q1D, NE);
144 auto Y =
Reshape(y.ReadWrite(), D1D, D1D, NE);
147 const int D1D = T_D1D ? T_D1D : d1d;
148 const int Q1D = T_Q1D ? T_Q1D : q1d;
149 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
150 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
152 for (
int qx = 0; qx < Q1D; ++qx)
154 for (
int dy = 0; dy < D1D; ++dy)
157 for (
int qy = 0; qy < Q1D; ++qy)
159 QD[qx][dy] += B(qy, dy) * B(qy, dy) * D(qx, qy, e);
163 for (
int dy = 0; dy < D1D; ++dy)
165 for (
int dx = 0; dx < D1D; ++dx)
167 for (
int qx = 0; qx < Q1D; ++qx)
169 Y(dx,dy,e) += B(qx, dx) * B(qx, dx) * QD[qx][dy];
178constexpr int ipow(
int x,
int p) {
return p == 0 ? 1 : x*ipow(x,
p-1); }
179constexpr int D(
int D1D) {
return (11 - D1D) / 2; }
180constexpr int NBZ(
int D1D)
182 return ipow(2, D(D1D) >= 0 ? D(D1D) : 0);
187template<
int T_D1D = 0,
int T_Q1D = 0>
188inline void SmemPAMassAssembleDiagonal2D(
const int NE,
189 const Array<real_t> &b_,
195 static constexpr int T_NBZ = mass::NBZ(T_D1D);
196 static constexpr int NBZ = T_NBZ ? T_NBZ : 1;
197 const int D1D = T_D1D ? T_D1D : d1d;
198 const int Q1D = T_Q1D ? T_Q1D : q1d;
201 MFEM_VERIFY(D1D <= max_d1d,
"");
202 MFEM_VERIFY(Q1D <= max_q1d,
"");
203 auto b =
Reshape(b_.Read(), Q1D, D1D);
204 auto D =
Reshape(d_.Read(), Q1D, Q1D, NE);
205 auto Y =
Reshape(y_.ReadWrite(), D1D, D1D, NE);
208 const int tidz = MFEM_THREAD_ID(z);
209 const int D1D = T_D1D ? T_D1D : d1d;
210 const int Q1D = T_Q1D ? T_Q1D : q1d;
211 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
212 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
213 MFEM_SHARED
real_t B[MQ1][MD1];
214 MFEM_SHARED
real_t QDZ[NBZ][MQ1][MD1];
218 MFEM_FOREACH_THREAD(d,y,D1D)
220 MFEM_FOREACH_THREAD(q,x,Q1D)
227 MFEM_FOREACH_THREAD(qx,x,Q1D)
229 MFEM_FOREACH_THREAD(dy,y,D1D)
232 for (
int qy = 0; qy < Q1D; ++qy)
234 QD[qx][dy] += B[qy][dy] * B[qy][dy] * D(qx, qy, e);
239 MFEM_FOREACH_THREAD(dy,y,D1D)
241 MFEM_FOREACH_THREAD(dx,x,D1D)
243 for (
int qx = 0; qx < Q1D; ++qx)
246 Y(dx,dy,e) += B[qx][dx] * B[qx][dx] * QD[qx][dy];
254template<
int T_D1D = 0,
int T_Q1D = 0>
255inline void PAMassAssembleDiagonal3D(
const int NE,
256 const Array<real_t> &
b,
262 const int D1D = T_D1D ? T_D1D : d1d;
263 const int Q1D = T_Q1D ? T_Q1D : q1d;
266 auto B =
Reshape(
b.Read(), Q1D, D1D);
267 auto D =
Reshape(d.Read(), Q1D, Q1D, Q1D, NE);
268 auto Y =
Reshape(y.ReadWrite(), D1D, D1D, D1D, NE);
271 const int D1D = T_D1D ? T_D1D : d1d;
272 const int Q1D = T_Q1D ? T_Q1D : q1d;
273 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
274 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
275 real_t QQD[MQ1][MQ1][MD1];
276 real_t QDD[MQ1][MD1][MD1];
277 for (
int qx = 0; qx < Q1D; ++qx)
279 for (
int qy = 0; qy < Q1D; ++qy)
281 for (
int dz = 0; dz < D1D; ++dz)
283 QQD[qx][qy][dz] = 0.0;
284 for (
int qz = 0; qz < Q1D; ++qz)
286 QQD[qx][qy][dz] += B(qz, dz) * B(qz, dz) * D(qx, qy, qz, e);
291 for (
int qx = 0; qx < Q1D; ++qx)
293 for (
int dz = 0; dz < D1D; ++dz)
295 for (
int dy = 0; dy < D1D; ++dy)
297 QDD[qx][dy][dz] = 0.0;
298 for (
int qy = 0; qy < Q1D; ++qy)
300 QDD[qx][dy][dz] += B(qy, dy) * B(qy, dy) * QQD[qx][qy][dz];
305 for (
int dz = 0; dz < D1D; ++dz)
307 for (
int dy = 0; dy < D1D; ++dy)
309 for (
int dx = 0; dx < D1D; ++dx)
312 for (
int qx = 0; qx < Q1D; ++qx)
314 t += B(qx, dx) * B(qx, dx) * QDD[qx][dy][dz];
316 Y(dx, dy, dz, e) += t;
324template<
int T_D1D = 0,
int T_Q1D = 0>
325inline void SmemPAMassAssembleDiagonal3D(
const int NE,
326 const Array<real_t> &b_,
332 const int D1D = T_D1D ? T_D1D : d1d;
333 const int Q1D = T_Q1D ? T_Q1D : q1d;
336 MFEM_VERIFY(D1D <= max_d1d,
"");
337 MFEM_VERIFY(Q1D <= max_q1d,
"");
338 auto b =
Reshape(b_.Read(), Q1D, D1D);
339 auto D =
Reshape(d_.Read(), Q1D, Q1D, Q1D, NE);
340 auto Y =
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
343 const int tidz = MFEM_THREAD_ID(z);
344 const int D1D = T_D1D ? T_D1D : d1d;
345 const int Q1D = T_Q1D ? T_Q1D : q1d;
346 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
347 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
348 MFEM_SHARED
real_t B[MQ1][MD1];
349 MFEM_SHARED
real_t QQD[MQ1][MQ1][MD1];
350 MFEM_SHARED
real_t QDD[MQ1][MD1][MD1];
353 MFEM_FOREACH_THREAD(d,y,D1D)
355 MFEM_FOREACH_THREAD(q,x,Q1D)
362 MFEM_FOREACH_THREAD(qx,x,Q1D)
364 MFEM_FOREACH_THREAD(qy,y,Q1D)
366 MFEM_FOREACH_THREAD(dz,z,D1D)
368 QQD[qx][qy][dz] = 0.0;
369 for (
int qz = 0; qz < Q1D; ++qz)
371 QQD[qx][qy][dz] += B[qz][dz] * B[qz][dz] * D(qx, qy, qz, e);
377 MFEM_FOREACH_THREAD(qx,x,Q1D)
379 MFEM_FOREACH_THREAD(dz,z,D1D)
381 MFEM_FOREACH_THREAD(dy,y,D1D)
383 QDD[qx][dy][dz] = 0.0;
384 for (
int qy = 0; qy < Q1D; ++qy)
386 QDD[qx][dy][dz] += B[qy][dy] * B[qy][dy] * QQD[qx][qy][dz];
392 MFEM_FOREACH_THREAD(dz,z,D1D)
394 MFEM_FOREACH_THREAD(dy,y,D1D)
396 MFEM_FOREACH_THREAD(dx,x,D1D)
399 for (
int qx = 0; qx < Q1D; ++qx)
401 t += B[qx][dx] * B[qx][dx] * QDD[qx][dy][dz];
403 Y(dx, dy, dz, e) += t;
412void OccaPAMassApply2D(
const int D1D,
415 const Array<real_t> &B,
416 const Array<real_t> &Bt,
422void OccaPAMassApply3D(
const int D1D,
425 const Array<real_t> &B,
426 const Array<real_t> &Bt,
432template <
bool ACCUMULATE = true>
433MFEM_HOST_DEVICE
inline
434void PAMassApply2D_Element(
const int e,
454 for (
int dy = 0; dy < D1D; ++dy)
456 for (
int dx = 0; dx < D1D; ++dx)
463 constexpr int max_D1D = DofQuadLimits::MAX_D1D;
464 constexpr int max_Q1D = DofQuadLimits::MAX_Q1D;
465 real_t sol_xy[max_Q1D][max_Q1D];
466 for (
int qy = 0; qy < Q1D; ++qy)
468 for (
int qx = 0; qx < Q1D; ++qx)
470 sol_xy[qy][qx] = 0.0;
473 for (
int dy = 0; dy < D1D; ++dy)
476 for (
int qy = 0; qy < Q1D; ++qy)
480 for (
int dx = 0; dx < D1D; ++dx)
482 const real_t s = X(dx,dy,e);
483 for (
int qx = 0; qx < Q1D; ++qx)
485 sol_x[qx] += B(qx,dx)* s;
488 for (
int qy = 0; qy < Q1D; ++qy)
490 const real_t d2q = B(qy,dy);
491 for (
int qx = 0; qx < Q1D; ++qx)
493 sol_xy[qy][qx] += d2q * sol_x[qx];
497 for (
int qy = 0; qy < Q1D; ++qy)
499 for (
int qx = 0; qx < Q1D; ++qx)
501 sol_xy[qy][qx] *= D(qx,qy,e);
504 for (
int qy = 0; qy < Q1D; ++qy)
507 for (
int dx = 0; dx < D1D; ++dx)
511 for (
int qx = 0; qx < Q1D; ++qx)
513 const real_t s = sol_xy[qy][qx];
514 for (
int dx = 0; dx < D1D; ++dx)
516 sol_x[dx] += Bt(dx,qx) * s;
519 for (
int dy = 0; dy < D1D; ++dy)
521 const real_t q2d = Bt(dy,qy);
522 for (
int dx = 0; dx < D1D; ++dx)
524 Y(dx,dy,e) += q2d * sol_x[dx];
530template<
int T_D1D,
int T_Q1D,
int T_NBZ,
bool ACCUMULATE = true>
531MFEM_HOST_DEVICE
inline
532void SmemPAMassApply2D_Element(
const int e,
541 const int D1D = T_D1D ? T_D1D : d1d;
542 const int Q1D = T_Q1D ? T_Q1D : q1d;
543 constexpr int NBZ = T_NBZ ? T_NBZ : 1;
545 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
546 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
547 constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
554 const int tidz = MFEM_THREAD_ID(z);
556 MFEM_SHARED
real_t BBt[MQ1*MD1];
559 MFEM_SHARED
real_t sm0[NBZ][MDQ*MDQ];
560 MFEM_SHARED
real_t sm1[NBZ][MDQ*MDQ];
567 MFEM_FOREACH_THREAD(dy,y,D1D)
569 MFEM_FOREACH_THREAD(dx,x,D1D)
571 X[dy][dx] = x(dx,dy,e);
576 MFEM_FOREACH_THREAD(dy,y,D1D)
578 MFEM_FOREACH_THREAD(q,x,Q1D)
585 MFEM_FOREACH_THREAD(dy,y,D1D)
587 MFEM_FOREACH_THREAD(qx,x,Q1D)
590 for (
int dx = 0; dx < D1D; ++dx)
592 dq += X[dy][dx] * B[qx][dx];
598 MFEM_FOREACH_THREAD(qy,y,Q1D)
600 MFEM_FOREACH_THREAD(qx,x,Q1D)
603 for (
int dy = 0; dy < D1D; ++dy)
605 qq += DQ[dy][qx] * B[qy][dy];
607 QQ[qy][qx] = qq * D(qx, qy, e);
613 MFEM_FOREACH_THREAD(dy,y,D1D)
615 MFEM_FOREACH_THREAD(q,x,Q1D)
622 MFEM_FOREACH_THREAD(qy,y,Q1D)
624 MFEM_FOREACH_THREAD(dx,x,D1D)
627 for (
int qx = 0; qx < Q1D; ++qx)
629 dq += QQ[qy][qx] * Bt[dx][qx];
635 MFEM_FOREACH_THREAD(dy,y,D1D)
637 MFEM_FOREACH_THREAD(dx,x,D1D)
640 for (
int qy = 0; qy < Q1D; ++qy)
642 dd += (QD[qy][dx] * Bt[dy][qy]);
656template <
bool ACCUMULATE = true>
657MFEM_HOST_DEVICE
inline
658void PAMassApply3D_Element(
const int e,
672 auto D = DeviceTensor<4,const real_t>(d_, Q1D, Q1D, Q1D, NE);
673 auto X = DeviceTensor<4,const real_t>(x_, D1D, D1D, D1D, NE);
674 auto Y = DeviceTensor<4,real_t>(y_, D1D, D1D, D1D, NE);
678 for (
int dz = 0; dz < D1D; ++dz)
680 for (
int dy = 0; dy < D1D; ++dy)
682 for (
int dx = 0; dx < D1D; ++dx)
684 Y(dx, dy, dz, e) = 0.0;
690 constexpr int max_D1D = DofQuadLimits::MAX_D1D;
691 constexpr int max_Q1D = DofQuadLimits::MAX_Q1D;
692 real_t sol_xyz[max_Q1D][max_Q1D][max_Q1D];
693 for (
int qz = 0; qz < Q1D; ++qz)
695 for (
int qy = 0; qy < Q1D; ++qy)
697 for (
int qx = 0; qx < Q1D; ++qx)
699 sol_xyz[qz][qy][qx] = 0.0;
703 for (
int dz = 0; dz < D1D; ++dz)
705 real_t sol_xy[max_Q1D][max_Q1D];
706 for (
int qy = 0; qy < Q1D; ++qy)
708 for (
int qx = 0; qx < Q1D; ++qx)
710 sol_xy[qy][qx] = 0.0;
713 for (
int dy = 0; dy < D1D; ++dy)
716 for (
int qx = 0; qx < Q1D; ++qx)
720 for (
int dx = 0; dx < D1D; ++dx)
722 const real_t s = X(dx,dy,dz,e);
723 for (
int qx = 0; qx < Q1D; ++qx)
725 sol_x[qx] += B(qx,dx) * s;
728 for (
int qy = 0; qy < Q1D; ++qy)
730 const real_t wy = B(qy,dy);
731 for (
int qx = 0; qx < Q1D; ++qx)
733 sol_xy[qy][qx] += wy * sol_x[qx];
737 for (
int qz = 0; qz < Q1D; ++qz)
739 const real_t wz = B(qz,dz);
740 for (
int qy = 0; qy < Q1D; ++qy)
742 for (
int qx = 0; qx < Q1D; ++qx)
744 sol_xyz[qz][qy][qx] += wz * sol_xy[qy][qx];
749 for (
int qz = 0; qz < Q1D; ++qz)
751 for (
int qy = 0; qy < Q1D; ++qy)
753 for (
int qx = 0; qx < Q1D; ++qx)
755 sol_xyz[qz][qy][qx] *= D(qx,qy,qz,e);
759 for (
int qz = 0; qz < Q1D; ++qz)
761 real_t sol_xy[max_D1D][max_D1D];
762 for (
int dy = 0; dy < D1D; ++dy)
764 for (
int dx = 0; dx < D1D; ++dx)
769 for (
int qy = 0; qy < Q1D; ++qy)
772 for (
int dx = 0; dx < D1D; ++dx)
776 for (
int qx = 0; qx < Q1D; ++qx)
778 const real_t s = sol_xyz[qz][qy][qx];
779 for (
int dx = 0; dx < D1D; ++dx)
781 sol_x[dx] += Bt(dx,qx) * s;
784 for (
int dy = 0; dy < D1D; ++dy)
786 const real_t wy = Bt(dy,qy);
787 for (
int dx = 0; dx < D1D; ++dx)
789 sol_xy[dy][dx] += wy * sol_x[dx];
793 for (
int dz = 0; dz < D1D; ++dz)
795 const real_t wz = Bt(dz,qz);
796 for (
int dy = 0; dy < D1D; ++dy)
798 for (
int dx = 0; dx < D1D; ++dx)
800 Y(dx,dy,dz,e) += wz * sol_xy[dy][dx];
807template<
int T_D1D,
int T_Q1D,
bool ACCUMULATE = true>
808MFEM_HOST_DEVICE
inline
809void SmemPAMassApply3D_Element(
const int e,
818 constexpr int D1D = T_D1D ? T_D1D : d1d;
819 constexpr int Q1D = T_Q1D ? T_Q1D : q1d;
820 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
821 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
822 constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
825 auto d = DeviceTensor<4,const real_t>(d_, Q1D, Q1D, Q1D, NE);
826 auto x = DeviceTensor<4,const real_t>(x_, D1D, D1D, D1D, NE);
827 auto y = DeviceTensor<4,real_t>(y_, D1D, D1D, D1D, NE);
829 MFEM_SHARED
real_t sDQ[MQ1*MD1];
832 MFEM_SHARED
real_t sm0[MDQ*MDQ*MDQ];
833 MFEM_SHARED
real_t sm1[MDQ*MDQ*MDQ];
840 MFEM_FOREACH_THREAD(dy,y,D1D)
842 MFEM_FOREACH_THREAD(dx,x,D1D)
845 for (
int dz = 0; dz < D1D; ++dz)
847 X[dz][dy][dx] = x(dx,dy,dz,e);
850 MFEM_FOREACH_THREAD(dx,x,Q1D)
852 B[dx][dy] =
b(dx,dy);
856 MFEM_FOREACH_THREAD(dy,y,D1D)
858 MFEM_FOREACH_THREAD(qx,x,Q1D)
862 for (
int dz = 0; dz < D1D; dz++)
867 for (
int dx = 0; dx < D1D; ++dx)
870 for (
int dz = 0; dz < D1D; ++dz)
872 u[dz] += X[dz][dy][dx] * B[qx][dx];
876 for (
int dz = 0; dz < D1D; ++dz)
878 DDQ[dz][dy][qx] =
u[dz];
883 MFEM_FOREACH_THREAD(qy,y,Q1D)
885 MFEM_FOREACH_THREAD(qx,x,Q1D)
889 for (
int dz = 0; dz < D1D; dz++)
894 for (
int dy = 0; dy < D1D; ++dy)
897 for (
int dz = 0; dz < D1D; dz++)
899 u[dz] += DDQ[dz][dy][qx] * B[qy][dy];
903 for (
int dz = 0; dz < D1D; dz++)
905 DQQ[dz][qy][qx] =
u[dz];
910 MFEM_FOREACH_THREAD(qy,y,Q1D)
912 MFEM_FOREACH_THREAD(qx,x,Q1D)
916 for (
int qz = 0; qz < Q1D; qz++)
921 for (
int dz = 0; dz < D1D; ++dz)
924 for (
int qz = 0; qz < Q1D; qz++)
926 u[qz] += DQQ[dz][qy][qx] * B[qz][dz];
930 for (
int qz = 0; qz < Q1D; qz++)
932 QQQ[qz][qy][qx] =
u[qz] * d(qx,qy,qz,e);
937 MFEM_FOREACH_THREAD(di,y,D1D)
939 MFEM_FOREACH_THREAD(q,x,Q1D)
945 MFEM_FOREACH_THREAD(qy,y,Q1D)
947 MFEM_FOREACH_THREAD(dx,x,D1D)
951 for (
int qz = 0; qz < Q1D; ++qz)
956 for (
int qx = 0; qx < Q1D; ++qx)
959 for (
int qz = 0; qz < Q1D; ++qz)
961 u[qz] += QQQ[qz][qy][qx] * Bt[dx][qx];
965 for (
int qz = 0; qz < Q1D; ++qz)
967 QQD[qz][qy][dx] =
u[qz];
972 MFEM_FOREACH_THREAD(dy,y,D1D)
974 MFEM_FOREACH_THREAD(dx,x,D1D)
978 for (
int qz = 0; qz < Q1D; ++qz)
983 for (
int qy = 0; qy < Q1D; ++qy)
986 for (
int qz = 0; qz < Q1D; ++qz)
988 u[qz] += QQD[qz][qy][dx] * Bt[dy][qy];
992 for (
int qz = 0; qz < Q1D; ++qz)
994 QDD[qz][dy][dx] =
u[qz];
999 MFEM_FOREACH_THREAD(dy,y,D1D)
1001 MFEM_FOREACH_THREAD(dx,x,D1D)
1005 for (
int dz = 0; dz < D1D; ++dz)
1010 for (
int qz = 0; qz < Q1D; ++qz)
1013 for (
int dz = 0; dz < D1D; ++dz)
1015 u[dz] += QDD[qz][dy][dx] * Bt[dz][qz];
1019 for (
int dz = 0; dz < D1D; ++dz)
1023 y(dx,dy,dz,e) +=
u[dz];
1027 y(dx,dy,dz,e) =
u[dz];
1036template<
int T_D1D = 0,
int T_Q1D = 0>
1037inline void PAMassApply2D(
const int NE,
1038 const Array<real_t> &b_,
1039 const Array<real_t> &bt_,
1046 MFEM_VERIFY(T_D1D ? T_D1D : d1d <= DeviceDofQuadLimits::Get().MAX_D1D,
"");
1047 MFEM_VERIFY(T_Q1D ? T_Q1D : q1d <= DeviceDofQuadLimits::Get().MAX_Q1D,
"");
1049 const auto B = b_.Read();
1050 const auto Bt = bt_.Read();
1051 const auto D = d_.Read();
1052 const auto X = x_.Read();
1053 auto Y = y_.ReadWrite();
1057 internal::PAMassApply2D_Element(e, NE, B, Bt, D, X, Y, d1d, q1d);
1062template<
int T_D1D = 0,
int T_Q1D = 0>
1063inline void SmemPAMassApply2D(
const int NE,
1064 const Array<real_t> &b_,
1065 const Array<real_t> &bt_,
1072 MFEM_CONTRACT_VAR(bt_);
1073 static constexpr int T_NBZ = mass::NBZ(T_D1D);
1074 static constexpr int NBZ = T_NBZ ? T_NBZ : 1;
1075 const int D1D = T_D1D ? T_D1D : d1d;
1076 const int Q1D = T_Q1D ? T_Q1D : q1d;
1079 MFEM_VERIFY(D1D <= max_d1d,
"");
1080 MFEM_VERIFY(Q1D <= max_q1d,
"");
1081 const auto b = b_.Read();
1082 const auto D = d_.Read();
1083 const auto x = x_.Read();
1084 auto Y = y_.ReadWrite();
1087 internal::SmemPAMassApply2D_Element<T_D1D,T_Q1D,T_NBZ>(
1088 e, NE,
b, D, x, Y, d1d, q1d);
1093template<
int T_D1D = 0,
int T_Q1D = 0>
1094inline void PAMassApply3D(
const int NE,
1095 const Array<real_t> &b_,
1096 const Array<real_t> &bt_,
1103 MFEM_VERIFY(T_D1D ? T_D1D : d1d <= DeviceDofQuadLimits::Get().MAX_D1D,
"");
1104 MFEM_VERIFY(T_Q1D ? T_Q1D : q1d <= DeviceDofQuadLimits::Get().MAX_Q1D,
"");
1106 const auto B = b_.Read();
1107 const auto Bt = bt_.Read();
1108 const auto D = d_.Read();
1109 const auto X = x_.Read();
1110 auto Y = y_.ReadWrite();
1114 internal::PAMassApply3D_Element(e, NE, B, Bt, D, X, Y, d1d, q1d);
1119template<
int T_D1D = 0,
int T_Q1D = 0>
1120inline void SmemPAMassApply3D(
const int NE,
1121 const Array<real_t> &b_,
1122 const Array<real_t> &bt_,
1129 MFEM_CONTRACT_VAR(bt_);
1130 const int D1D = T_D1D ? T_D1D : d1d;
1131 const int Q1D = T_Q1D ? T_Q1D : q1d;
1134 MFEM_VERIFY(D1D <= max_d1d,
"");
1135 MFEM_VERIFY(Q1D <= max_q1d,
"");
1139 auto y = y_.ReadWrite();
1142 internal::SmemPAMassApply3D_Element<T_D1D,T_Q1D>(e, NE,
b, d, x, y, d1d, q1d);
1146template<
int T_D1D = 0,
int T_Q1D = 0>
1147inline void EAMassAssemble1D(
const int NE,
1148 const Array<real_t> &basis,
1149 const Vector &padata,
1155 const int D1D = T_D1D ? T_D1D : d1d;
1156 const int Q1D = T_Q1D ? T_Q1D : q1d;
1159 auto B =
Reshape(basis.Read(), Q1D, D1D);
1160 auto D =
Reshape(padata.Read(), Q1D, NE);
1161 auto M =
Reshape(
add ? eadata.ReadWrite() : eadata.
Write(), D1D, D1D, NE);
1164 const int D1D = T_D1D ? T_D1D : d1d;
1165 const int Q1D = T_Q1D ? T_Q1D : q1d;
1166 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
1167 MFEM_FOREACH_THREAD(i1,x,D1D)
1170 for (
int q = 0; q < Q1D; q++) { r_Bi[q] = B(q,i1); }
1171 MFEM_FOREACH_THREAD(j1,y,D1D)
1174 for (
int q = 0; q < Q1D; q++) { r_Bj[q] = B(q,j1); }
1177 for (
int k1 = 0; k1 < Q1D; ++k1)
1179 val += r_Bi[k1] * r_Bj[k1] * D(k1, e);
1183 M(i1, j1, e) += val;
1194template<
int T_D1D = 0,
int T_Q1D = 0>
1195inline void EAMassAssemble2D(
const int NE,
1196 const Array<real_t> &basis,
1197 const Vector &padata,
1203 const int D1D = T_D1D ? T_D1D : d1d;
1204 const int Q1D = T_Q1D ? T_Q1D : q1d;
1207 auto B =
Reshape(basis.Read(), Q1D, D1D);
1208 auto D =
Reshape(padata.Read(), Q1D, Q1D, NE);
1209 auto M =
Reshape(
add ? eadata.ReadWrite() : eadata.
Write(), D1D, D1D, D1D, D1D,
1213 const int D1D = T_D1D ? T_D1D : d1d;
1214 const int Q1D = T_Q1D ? T_Q1D : q1d;
1215 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
1216 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
1218 for (
int d = 0; d < D1D; d++)
1220 for (
int q = 0; q < Q1D; q++)
1225 MFEM_SHARED
real_t s_D[MQ1][MQ1];
1226 MFEM_FOREACH_THREAD(k1,x,Q1D)
1228 MFEM_FOREACH_THREAD(k2,y,Q1D)
1230 s_D[k1][k2] = D(k1,k2,e);
1234 MFEM_FOREACH_THREAD(i1,x,D1D)
1236 MFEM_FOREACH_THREAD(i2,y,D1D)
1238 for (
int j1 = 0; j1 < D1D; ++j1)
1240 for (
int j2 = 0; j2 < D1D; ++j2)
1243 for (
int k1 = 0; k1 < Q1D; ++k1)
1245 for (
int k2 = 0; k2 < Q1D; ++k2)
1247 val += r_B[k1][i1] * r_B[k1][j1]
1248 * r_B[k2][i2] * r_B[k2][j2]
1254 M(i1, i2, j1, j2, e) += val;
1258 M(i1, i2, j1, j2, e) = val;
1267template<
int T_D1D = 0,
int T_Q1D = 0>
1268inline void EAMassAssemble3D(
const int NE,
1269 const Array<real_t> &basis,
1270 const Vector &padata,
1276 const int D1D = T_D1D ? T_D1D : d1d;
1277 const int Q1D = T_Q1D ? T_Q1D : q1d;
1280 auto B =
Reshape(basis.Read(), Q1D, D1D);
1281 auto D =
Reshape(padata.Read(), Q1D, Q1D, Q1D, NE);
1282 auto M =
Reshape(
add ? eadata.ReadWrite() : eadata.
Write(), D1D, D1D, D1D, D1D,
1286 const int D1D = T_D1D ? T_D1D : d1d;
1287 const int Q1D = T_Q1D ? T_Q1D : q1d;
1288 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
1289 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
1290 constexpr int DQ = T_D1D * T_Q1D;
1294 constexpr bool USE_REG = DQ != 0 && DQ <= 12;
1295 constexpr int MD1r = USE_REG ? MD1 : 1;
1296 constexpr int MQ1r = USE_REG ? MQ1 : 1;
1297 constexpr int MD1s = USE_REG ? 1 : MD1;
1298 constexpr int MQ1s = USE_REG ? 1 : MQ1;
1300 MFEM_SHARED
real_t s_B[MQ1s][MD1s];
1302 real_t (*l_B)[MD1] =
nullptr;
1305 for (
int d = 0; d < D1D; d++)
1307 for (
int q = 0; q < Q1D; q++)
1312 l_B = (
real_t (*)[MD1])r_B;
1316 if (MFEM_THREAD_ID(z) == 0)
1318 MFEM_FOREACH_THREAD(d,x,D1D)
1320 MFEM_FOREACH_THREAD(q,y,Q1D)
1326 l_B = (
real_t (*)[MD1])s_B;
1329 MFEM_SHARED
real_t s_D[MQ1][MQ1][MQ1];
1330 MFEM_FOREACH_THREAD(k1,x,Q1D)
1332 MFEM_FOREACH_THREAD(k2,y,Q1D)
1334 MFEM_FOREACH_THREAD(k3,z,Q1D)
1336 s_D[k1][k2][k3] = D(k1,k2,k3,e);
1341 MFEM_FOREACH_THREAD(i1,x,D1D)
1343 MFEM_FOREACH_THREAD(i2,y,D1D)
1345 MFEM_FOREACH_THREAD(i3,z,D1D)
1347 for (
int j1 = 0; j1 < D1D; ++j1)
1349 for (
int j2 = 0; j2 < D1D; ++j2)
1351 for (
int j3 = 0; j3 < D1D; ++j3)
1354 for (
int k1 = 0; k1 < Q1D; ++k1)
1356 for (
int k2 = 0; k2 < Q1D; ++k2)
1358 for (
int k3 = 0; k3 < Q1D; ++k3)
1360 val += l_B[k1][i1] * l_B[k1][j1]
1361 * l_B[k2][i2] * l_B[k2][j2]
1362 * l_B[k3][i3] * l_B[k3][j3]
1369 M(i1, i2, i3, j1, j2, j3, e) += val;
1373 M(i1, i2, i3, j1, j2, j3, e) = val;
1392template<
int DIM,
int T_D1D,
int T_Q1D>
1393ApplyKernelType MassIntegrator::ApplyPAKernels::Kernel()
1395 if constexpr (
DIM == 1) {
return internal::PAMassApply1D; }
1396 else if constexpr (
DIM == 2) {
return internal::SmemPAMassApply2D<T_D1D,T_Q1D>; }
1397 else if constexpr (
DIM == 3) {
return internal::SmemPAMassApply3D<T_D1D, T_Q1D>; }
1401inline ApplyKernelType MassIntegrator::ApplyPAKernels::Fallback(
1404 if (
DIM == 1) {
return internal::PAMassApply1D; }
1405 else if (
DIM == 2) {
return internal::PAMassApply2D; }
1406 else if (
DIM == 3) {
return internal::PAMassApply3D; }
1407 else { MFEM_ABORT(
""); }
1410template<
int DIM,
int T_D1D,
int T_Q1D>
1411DiagonalKernelType MassIntegrator::DiagonalPAKernels::Kernel()
1413 if constexpr (
DIM == 1) {
return internal::PAMassAssembleDiagonal1D; }
1414 else if constexpr (
DIM == 2) {
return internal::SmemPAMassAssembleDiagonal2D<T_D1D,T_Q1D>; }
1415 else if constexpr (
DIM == 3) {
return internal::SmemPAMassAssembleDiagonal3D<T_D1D, T_Q1D>; }
1419inline DiagonalKernelType MassIntegrator::DiagonalPAKernels::Fallback(
1422 if (
DIM == 1) {
return internal::PAMassAssembleDiagonal1D; }
1423 else if (
DIM == 2) {
return internal::PAMassAssembleDiagonal2D; }
1424 else if (
DIM == 3) {
return internal::PAMassAssembleDiagonal3D; }
1425 else { MFEM_ABORT(
""); }
void(*)(const int, const Array< real_t > &, const Array< real_t > &, const Vector &, const Vector &, Vector &, const int, const int) ApplyKernelType
void(*)(const int, const Array< real_t > &, const Vector &, Vector &, const int, const int) DiagonalKernelType
DeviceTensor< 3, real_t > DeviceCube
real_t u(const Vector &xvec)
DeviceTensor< 3, const real_t > ConstDeviceCube
T * Write(Memory< T > &mem, int size, bool on_dev=true)
Get a pointer for write access to mem with the mfem::Device's DeviceMemoryClass, if on_dev = true,...
void add(const Vector &v1, const Vector &v2, Vector &v)
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
void forall_2D(int N, int X, int Y, lambda &&body)
void forall_3D(int N, int X, int Y, int Z, lambda &&body)
DeviceTensor< 2, const real_t > ConstDeviceMatrix
void forall(int N, lambda &&body)
DeviceTensor< 2, real_t > DeviceMatrix
real_t p(const Vector &x, real_t t)
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.
int MAX_D1D
Maximum number of 1D nodal points.
int MAX_Q1D
Maximum number of 1D quadrature points.