12#ifndef MFEM_BILININTEG_MASS_KERNELS_HPP 
   13#define MFEM_BILININTEG_MASS_KERNELS_HPP 
   31static void PAMassAssembleDiagonal1D(
const int NE,
 
   32                                     const Array<real_t> &
b,
 
   38   auto B = 
Reshape(
b.Read(), Q1D, D1D);
 
   39   auto D = 
Reshape(d.Read(), Q1D, NE);
 
   40   auto Y = 
Reshape(y.ReadWrite(), D1D, NE);
 
   43      for (
int dx = 0; dx < D1D; ++dx)
 
   45         for (
int qx = 0; qx < Q1D; ++qx)
 
   47            Y(dx, e) += B(qx, dx) * B(qx, dx) * D(qx, e);
 
   53MFEM_HOST_DEVICE 
inline 
   54void PAMassApply1D_Element(
const int e,
 
   72   real_t XQ[DofQuadLimits::MAX_Q1D];
 
   73   for (
int qx = 0; qx < Q1D; ++qx)
 
   77   for (
int dx = 0; dx < D1D; ++dx)
 
   80      for (
int qx = 0; qx < Q1D; ++qx)
 
   85   for (
int qx = 0; qx < Q1D; ++qx)
 
   87      const double q = XQ[qx]*D(qx,e);
 
   88      for (
int dx = 0; dx < D1D; ++dx)
 
   90         Y(dx,e) += Bt(dx,qx) * q;
 
   96static void PAMassApply1D(
const int NE,
 
   97                          const Array<real_t> &b_,
 
   98                          const Array<real_t> &bt_,
 
  108   const auto B = b_.Read();
 
  109   const auto Bt = bt_.Read();
 
  110   const auto D = d_.Read();
 
  111   const auto X = x_.Read();
 
  112   auto Y = y_.ReadWrite();
 
  116      internal::PAMassApply1D_Element(e, NE, B, Bt, D, X, Y, d1d, q1d);
 
  121template<
int T_D1D = 0, 
int T_Q1D = 0>
 
  122inline void PAMassAssembleDiagonal2D(
const int NE,
 
  123                                     const Array<real_t> &
b,
 
  129   const int D1D = T_D1D ? T_D1D : d1d;
 
  130   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  133   auto B = 
Reshape(
b.Read(), Q1D, D1D);
 
  134   auto D = 
Reshape(d.Read(), Q1D, Q1D, NE);
 
  135   auto Y = 
Reshape(y.ReadWrite(), D1D, D1D, NE);
 
  138      const int D1D = T_D1D ? T_D1D : d1d;
 
  139      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  140      constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  141      constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  143      for (
int qx = 0; qx < Q1D; ++qx)
 
  145         for (
int dy = 0; dy < D1D; ++dy)
 
  148            for (
int qy = 0; qy < Q1D; ++qy)
 
  150               QD[qx][dy] += B(qy, dy) * B(qy, dy) * D(qx, qy, e);
 
  154      for (
int dy = 0; dy < D1D; ++dy)
 
  156         for (
int dx = 0; dx < D1D; ++dx)
 
  158            for (
int qx = 0; qx < Q1D; ++qx)
 
  160               Y(dx,dy,e) += B(qx, dx) * B(qx, dx) * QD[qx][dy];
 
  169constexpr int ipow(
int x, 
int p) { 
return p == 0 ? 1 : x*ipow(x, 
p-1); }
 
  170constexpr int D(
int D1D) { 
return (11 - D1D) / 2; }
 
  171constexpr int NBZ(
int D1D)
 
  173   return ipow(2, D(D1D) >= 0 ? D(D1D) : 0);
 
  178template<
int T_D1D = 0, 
int T_Q1D = 0>
 
  179inline void SmemPAMassAssembleDiagonal2D(
const int NE,
 
  180                                         const Array<real_t> &b_,
 
  186   static constexpr int T_NBZ = mass::NBZ(T_D1D);
 
  187   static constexpr int NBZ = T_NBZ ? T_NBZ : 1;
 
  188   const int D1D = T_D1D ? T_D1D : d1d;
 
  189   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  192   MFEM_VERIFY(D1D <= max_d1d, 
"");
 
  193   MFEM_VERIFY(Q1D <= max_q1d, 
"");
 
  194   auto b = 
Reshape(b_.Read(), Q1D, D1D);
 
  195   auto D = 
Reshape(d_.Read(), Q1D, Q1D, NE);
 
  196   auto Y = 
Reshape(y_.ReadWrite(), D1D, D1D, NE);
 
  199      const int tidz = MFEM_THREAD_ID(z);
 
  200      const int D1D = T_D1D ? T_D1D : d1d;
 
  201      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  202      constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  203      constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  204      MFEM_SHARED 
real_t B[MQ1][MD1];
 
  205      MFEM_SHARED 
real_t QDZ[NBZ][MQ1][MD1];
 
  209         MFEM_FOREACH_THREAD(d,y,D1D)
 
  211            MFEM_FOREACH_THREAD(q,x,Q1D)
 
  218      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  220         MFEM_FOREACH_THREAD(dy,y,D1D)
 
  223            for (
int qy = 0; qy < Q1D; ++qy)
 
  225               QD[qx][dy] += B[qy][dy] * B[qy][dy] * D(qx, qy, e);
 
  230      MFEM_FOREACH_THREAD(dy,y,D1D)
 
  232         MFEM_FOREACH_THREAD(dx,x,D1D)
 
  234            for (
int qx = 0; qx < Q1D; ++qx)
 
  237               Y(dx,dy,e) += B[qx][dx] * B[qx][dx] * QD[qx][dy];
 
  245template<
int T_D1D = 0, 
int T_Q1D = 0>
 
  246inline void PAMassAssembleDiagonal3D(
const int NE,
 
  247                                     const Array<real_t> &
b,
 
  253   const int D1D = T_D1D ? T_D1D : d1d;
 
  254   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  257   auto B = 
Reshape(
b.Read(), Q1D, D1D);
 
  258   auto D = 
Reshape(d.Read(), Q1D, Q1D, Q1D, NE);
 
  259   auto Y = 
Reshape(y.ReadWrite(), D1D, D1D, D1D, NE);
 
  262      const int D1D = T_D1D ? T_D1D : d1d;
 
  263      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  264      constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  265      constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  266      real_t QQD[MQ1][MQ1][MD1];
 
  267      real_t QDD[MQ1][MD1][MD1];
 
  268      for (
int qx = 0; qx < Q1D; ++qx)
 
  270         for (
int qy = 0; qy < Q1D; ++qy)
 
  272            for (
int dz = 0; dz < D1D; ++dz)
 
  274               QQD[qx][qy][dz] = 0.0;
 
  275               for (
int qz = 0; qz < Q1D; ++qz)
 
  277                  QQD[qx][qy][dz] += B(qz, dz) * B(qz, dz) * D(qx, qy, qz, e);
 
  282      for (
int qx = 0; qx < Q1D; ++qx)
 
  284         for (
int dz = 0; dz < D1D; ++dz)
 
  286            for (
int dy = 0; dy < D1D; ++dy)
 
  288               QDD[qx][dy][dz] = 0.0;
 
  289               for (
int qy = 0; qy < Q1D; ++qy)
 
  291                  QDD[qx][dy][dz] += B(qy, dy) * B(qy, dy) * QQD[qx][qy][dz];
 
  296      for (
int dz = 0; dz < D1D; ++dz)
 
  298         for (
int dy = 0; dy < D1D; ++dy)
 
  300            for (
int dx = 0; dx < D1D; ++dx)
 
  303               for (
int qx = 0; qx < Q1D; ++qx)
 
  305                  t += B(qx, dx) * B(qx, dx) * QDD[qx][dy][dz];
 
  307               Y(dx, dy, dz, e) += t;
 
  315template<
int T_D1D = 0, 
int T_Q1D = 0>
 
  316inline void SmemPAMassAssembleDiagonal3D(
const int NE,
 
  317                                         const Array<real_t> &b_,
 
  323   const int D1D = T_D1D ? T_D1D : d1d;
 
  324   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  327   MFEM_VERIFY(D1D <= max_d1d, 
"");
 
  328   MFEM_VERIFY(Q1D <= max_q1d, 
"");
 
  329   auto b = 
Reshape(b_.Read(), Q1D, D1D);
 
  330   auto D = 
Reshape(d_.Read(), Q1D, Q1D, Q1D, NE);
 
  331   auto Y = 
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
 
  334      const int tidz = MFEM_THREAD_ID(z);
 
  335      const int D1D = T_D1D ? T_D1D : d1d;
 
  336      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  337      constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  338      constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  339      MFEM_SHARED 
real_t B[MQ1][MD1];
 
  340      MFEM_SHARED 
real_t QQD[MQ1][MQ1][MD1];
 
  341      MFEM_SHARED 
real_t QDD[MQ1][MD1][MD1];
 
  344         MFEM_FOREACH_THREAD(d,y,D1D)
 
  346            MFEM_FOREACH_THREAD(q,x,Q1D)
 
  353      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  355         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  357            MFEM_FOREACH_THREAD(dz,z,D1D)
 
  359               QQD[qx][qy][dz] = 0.0;
 
  360               for (
int qz = 0; qz < Q1D; ++qz)
 
  362                  QQD[qx][qy][dz] += B[qz][dz] * B[qz][dz] * D(qx, qy, qz, e);
 
  368      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  370         MFEM_FOREACH_THREAD(dz,z,D1D)
 
  372            MFEM_FOREACH_THREAD(dy,y,D1D)
 
  374               QDD[qx][dy][dz] = 0.0;
 
  375               for (
int qy = 0; qy < Q1D; ++qy)
 
  377                  QDD[qx][dy][dz] += B[qy][dy] * B[qy][dy] * QQD[qx][qy][dz];
 
  383      MFEM_FOREACH_THREAD(dz,z,D1D)
 
  385         MFEM_FOREACH_THREAD(dy,y,D1D)
 
  387            MFEM_FOREACH_THREAD(dx,x,D1D)
 
  390               for (
int qx = 0; qx < Q1D; ++qx)
 
  392                  t += B[qx][dx] * B[qx][dx] * QDD[qx][dy][dz];
 
  394               Y(dx, dy, dz, e) += t;
 
  403void OccaPAMassApply2D(
const int D1D,
 
  406                       const Array<real_t> &B,
 
  407                       const Array<real_t> &Bt,
 
  413void OccaPAMassApply3D(
const int D1D,
 
  416                       const Array<real_t> &B,
 
  417                       const Array<real_t> &Bt,
 
  423template <
bool ACCUMULATE = true>
 
  424MFEM_HOST_DEVICE 
inline 
  425void PAMassApply2D_Element(
const int e,
 
  445      for (
int dy = 0; dy < D1D; ++dy)
 
  447         for (
int dx = 0; dx < D1D; ++dx)
 
  454   constexpr int max_D1D = DofQuadLimits::MAX_D1D;
 
  455   constexpr int max_Q1D = DofQuadLimits::MAX_Q1D;
 
  456   real_t sol_xy[max_Q1D][max_Q1D];
 
  457   for (
int qy = 0; qy < Q1D; ++qy)
 
  459      for (
int qx = 0; qx < Q1D; ++qx)
 
  461         sol_xy[qy][qx] = 0.0;
 
  464   for (
int dy = 0; dy < D1D; ++dy)
 
  467      for (
int qy = 0; qy < Q1D; ++qy)
 
  471      for (
int dx = 0; dx < D1D; ++dx)
 
  473         const real_t s = X(dx,dy,e);
 
  474         for (
int qx = 0; qx < Q1D; ++qx)
 
  476            sol_x[qx] += B(qx,dx)* s;
 
  479      for (
int qy = 0; qy < Q1D; ++qy)
 
  481         const real_t d2q = B(qy,dy);
 
  482         for (
int qx = 0; qx < Q1D; ++qx)
 
  484            sol_xy[qy][qx] += d2q * sol_x[qx];
 
  488   for (
int qy = 0; qy < Q1D; ++qy)
 
  490      for (
int qx = 0; qx < Q1D; ++qx)
 
  492         sol_xy[qy][qx] *= D(qx,qy,e);
 
  495   for (
int qy = 0; qy < Q1D; ++qy)
 
  498      for (
int dx = 0; dx < D1D; ++dx)
 
  502      for (
int qx = 0; qx < Q1D; ++qx)
 
  504         const real_t s = sol_xy[qy][qx];
 
  505         for (
int dx = 0; dx < D1D; ++dx)
 
  507            sol_x[dx] += Bt(dx,qx) * s;
 
  510      for (
int dy = 0; dy < D1D; ++dy)
 
  512         const real_t q2d = Bt(dy,qy);
 
  513         for (
int dx = 0; dx < D1D; ++dx)
 
  515            Y(dx,dy,e) += q2d * sol_x[dx];
 
  521template<
int T_D1D, 
int T_Q1D, 
int T_NBZ, 
bool ACCUMULATE = true>
 
  522MFEM_HOST_DEVICE 
inline 
  523void SmemPAMassApply2D_Element(
const int e,
 
  532   const int D1D = T_D1D ? T_D1D : d1d;
 
  533   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  534   constexpr int NBZ = T_NBZ ? T_NBZ : 1;
 
  536   constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  537   constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  538   constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
 
  545   const int tidz = MFEM_THREAD_ID(z);
 
  547   MFEM_SHARED 
real_t BBt[MQ1*MD1];
 
  550   MFEM_SHARED 
real_t sm0[NBZ][MDQ*MDQ];
 
  551   MFEM_SHARED 
real_t sm1[NBZ][MDQ*MDQ];
 
  558   MFEM_FOREACH_THREAD(dy,y,D1D)
 
  560      MFEM_FOREACH_THREAD(dx,x,D1D)
 
  562         X[dy][dx] = x(dx,dy,e);
 
  567      MFEM_FOREACH_THREAD(dy,y,D1D)
 
  569         MFEM_FOREACH_THREAD(q,x,Q1D)
 
  576   MFEM_FOREACH_THREAD(dy,y,D1D)
 
  578      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  581         for (
int dx = 0; dx < D1D; ++dx)
 
  583            dq += X[dy][dx] * B[qx][dx];
 
  589   MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  591      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  594         for (
int dy = 0; dy < D1D; ++dy)
 
  596            qq += DQ[dy][qx] * B[qy][dy];
 
  598         QQ[qy][qx] = qq * D(qx, qy, e);
 
  604      MFEM_FOREACH_THREAD(dy,y,D1D)
 
  606         MFEM_FOREACH_THREAD(q,x,Q1D)
 
  613   MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  615      MFEM_FOREACH_THREAD(dx,x,D1D)
 
  618         for (
int qx = 0; qx < Q1D; ++qx)
 
  620            dq += QQ[qy][qx] * Bt[dx][qx];
 
  626   MFEM_FOREACH_THREAD(dy,y,D1D)
 
  628      MFEM_FOREACH_THREAD(dx,x,D1D)
 
  631         for (
int qy = 0; qy < Q1D; ++qy)
 
  633            dd += (QD[qy][dx] * Bt[dy][qy]);
 
  647template <
bool ACCUMULATE = true>
 
  648MFEM_HOST_DEVICE 
inline 
  649void PAMassApply3D_Element(
const int e,
 
  663   auto D = DeviceTensor<4,const real_t>(d_, Q1D, Q1D, Q1D, NE);
 
  664   auto X = DeviceTensor<4,const real_t>(x_, D1D, D1D, D1D, NE);
 
  665   auto Y = DeviceTensor<4,real_t>(y_, D1D, D1D, D1D, NE);
 
  669      for (
int dz = 0; dz < D1D; ++dz)
 
  671         for (
int dy = 0; dy < D1D; ++dy)
 
  673            for (
int dx = 0; dx < D1D; ++dx)
 
  675               Y(dx, dy, dz, e) = 0.0;
 
  681   constexpr int max_D1D = DofQuadLimits::MAX_D1D;
 
  682   constexpr int max_Q1D = DofQuadLimits::MAX_Q1D;
 
  683   real_t sol_xyz[max_Q1D][max_Q1D][max_Q1D];
 
  684   for (
int qz = 0; qz < Q1D; ++qz)
 
  686      for (
int qy = 0; qy < Q1D; ++qy)
 
  688         for (
int qx = 0; qx < Q1D; ++qx)
 
  690            sol_xyz[qz][qy][qx] = 0.0;
 
  694   for (
int dz = 0; dz < D1D; ++dz)
 
  696      real_t sol_xy[max_Q1D][max_Q1D];
 
  697      for (
int qy = 0; qy < Q1D; ++qy)
 
  699         for (
int qx = 0; qx < Q1D; ++qx)
 
  701            sol_xy[qy][qx] = 0.0;
 
  704      for (
int dy = 0; dy < D1D; ++dy)
 
  707         for (
int qx = 0; qx < Q1D; ++qx)
 
  711         for (
int dx = 0; dx < D1D; ++dx)
 
  713            const real_t s = X(dx,dy,dz,e);
 
  714            for (
int qx = 0; qx < Q1D; ++qx)
 
  716               sol_x[qx] += B(qx,dx) * s;
 
  719         for (
int qy = 0; qy < Q1D; ++qy)
 
  721            const real_t wy = B(qy,dy);
 
  722            for (
int qx = 0; qx < Q1D; ++qx)
 
  724               sol_xy[qy][qx] += wy * sol_x[qx];
 
  728      for (
int qz = 0; qz < Q1D; ++qz)
 
  730         const real_t wz = B(qz,dz);
 
  731         for (
int qy = 0; qy < Q1D; ++qy)
 
  733            for (
int qx = 0; qx < Q1D; ++qx)
 
  735               sol_xyz[qz][qy][qx] += wz * sol_xy[qy][qx];
 
  740   for (
int qz = 0; qz < Q1D; ++qz)
 
  742      for (
int qy = 0; qy < Q1D; ++qy)
 
  744         for (
int qx = 0; qx < Q1D; ++qx)
 
  746            sol_xyz[qz][qy][qx] *= D(qx,qy,qz,e);
 
  750   for (
int qz = 0; qz < Q1D; ++qz)
 
  752      real_t sol_xy[max_D1D][max_D1D];
 
  753      for (
int dy = 0; dy < D1D; ++dy)
 
  755         for (
int dx = 0; dx < D1D; ++dx)
 
  760      for (
int qy = 0; qy < Q1D; ++qy)
 
  763         for (
int dx = 0; dx < D1D; ++dx)
 
  767         for (
int qx = 0; qx < Q1D; ++qx)
 
  769            const real_t s = sol_xyz[qz][qy][qx];
 
  770            for (
int dx = 0; dx < D1D; ++dx)
 
  772               sol_x[dx] += Bt(dx,qx) * s;
 
  775         for (
int dy = 0; dy < D1D; ++dy)
 
  777            const real_t wy = Bt(dy,qy);
 
  778            for (
int dx = 0; dx < D1D; ++dx)
 
  780               sol_xy[dy][dx] += wy * sol_x[dx];
 
  784      for (
int dz = 0; dz < D1D; ++dz)
 
  786         const real_t wz = Bt(dz,qz);
 
  787         for (
int dy = 0; dy < D1D; ++dy)
 
  789            for (
int dx = 0; dx < D1D; ++dx)
 
  791               Y(dx,dy,dz,e) += wz * sol_xy[dy][dx];
 
  798template<
int T_D1D, 
int T_Q1D, 
bool ACCUMULATE = true>
 
  799MFEM_HOST_DEVICE 
inline 
  800void SmemPAMassApply3D_Element(
const int e,
 
  809   constexpr int D1D = T_D1D ? T_D1D : d1d;
 
  810   constexpr int Q1D = T_Q1D ? T_Q1D : q1d;
 
  811   constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  812   constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  813   constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
 
  816   auto d = DeviceTensor<4,const real_t>(d_, Q1D, Q1D, Q1D, NE);
 
  817   auto x = DeviceTensor<4,const real_t>(x_, D1D, D1D, D1D, NE);
 
  818   auto y = DeviceTensor<4,real_t>(y_, D1D, D1D, D1D, NE);
 
  820   MFEM_SHARED 
real_t sDQ[MQ1*MD1];
 
  823   MFEM_SHARED 
real_t sm0[MDQ*MDQ*MDQ];
 
  824   MFEM_SHARED 
real_t sm1[MDQ*MDQ*MDQ];
 
  831   MFEM_FOREACH_THREAD(dy,y,D1D)
 
  833      MFEM_FOREACH_THREAD(dx,x,D1D)
 
  836         for (
int dz = 0; dz < D1D; ++dz)
 
  838            X[dz][dy][dx] = x(dx,dy,dz,e);
 
  841      MFEM_FOREACH_THREAD(dx,x,Q1D)
 
  843         B[dx][dy] = 
b(dx,dy);
 
  847   MFEM_FOREACH_THREAD(dy,y,D1D)
 
  849      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  853         for (
int dz = 0; dz < D1D; dz++)
 
  858         for (
int dx = 0; dx < D1D; ++dx)
 
  861            for (
int dz = 0; dz < D1D; ++dz)
 
  863               u[dz] += X[dz][dy][dx] * B[qx][dx];
 
  867         for (
int dz = 0; dz < D1D; ++dz)
 
  869            DDQ[dz][dy][qx] = 
u[dz];
 
  874   MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  876      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  880         for (
int dz = 0; dz < D1D; dz++)
 
  885         for (
int dy = 0; dy < D1D; ++dy)
 
  888            for (
int dz = 0; dz < D1D; dz++)
 
  890               u[dz] += DDQ[dz][dy][qx] * B[qy][dy];
 
  894         for (
int dz = 0; dz < D1D; dz++)
 
  896            DQQ[dz][qy][qx] = 
u[dz];
 
  901   MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  903      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  907         for (
int qz = 0; qz < Q1D; qz++)
 
  912         for (
int dz = 0; dz < D1D; ++dz)
 
  915            for (
int qz = 0; qz < Q1D; qz++)
 
  917               u[qz] += DQQ[dz][qy][qx] * B[qz][dz];
 
  921         for (
int qz = 0; qz < Q1D; qz++)
 
  923            QQQ[qz][qy][qx] = 
u[qz] * d(qx,qy,qz,e);
 
  928   MFEM_FOREACH_THREAD(di,y,D1D)
 
  930      MFEM_FOREACH_THREAD(q,x,Q1D)
 
  936   MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  938      MFEM_FOREACH_THREAD(dx,x,D1D)
 
  942         for (
int qz = 0; qz < Q1D; ++qz)
 
  947         for (
int qx = 0; qx < Q1D; ++qx)
 
  950            for (
int qz = 0; qz < Q1D; ++qz)
 
  952               u[qz] += QQQ[qz][qy][qx] * Bt[dx][qx];
 
  956         for (
int qz = 0; qz < Q1D; ++qz)
 
  958            QQD[qz][qy][dx] = 
u[qz];
 
  963   MFEM_FOREACH_THREAD(dy,y,D1D)
 
  965      MFEM_FOREACH_THREAD(dx,x,D1D)
 
  969         for (
int qz = 0; qz < Q1D; ++qz)
 
  974         for (
int qy = 0; qy < Q1D; ++qy)
 
  977            for (
int qz = 0; qz < Q1D; ++qz)
 
  979               u[qz] += QQD[qz][qy][dx] * Bt[dy][qy];
 
  983         for (
int qz = 0; qz < Q1D; ++qz)
 
  985            QDD[qz][dy][dx] = 
u[qz];
 
  990   MFEM_FOREACH_THREAD(dy,y,D1D)
 
  992      MFEM_FOREACH_THREAD(dx,x,D1D)
 
  996         for (
int dz = 0; dz < D1D; ++dz)
 
 1001         for (
int qz = 0; qz < Q1D; ++qz)
 
 1004            for (
int dz = 0; dz < D1D; ++dz)
 
 1006               u[dz] += QDD[qz][dy][dx] * Bt[dz][qz];
 
 1010         for (
int dz = 0; dz < D1D; ++dz)
 
 1014               y(dx,dy,dz,e) += 
u[dz];
 
 1018               y(dx,dy,dz,e) = 
u[dz];
 
 1027template<
int T_D1D = 0, 
int T_Q1D = 0>
 
 1028inline void PAMassApply2D(
const int NE,
 
 1029                          const Array<real_t> &b_,
 
 1030                          const Array<real_t> &bt_,
 
 1037   MFEM_VERIFY(T_D1D ? T_D1D : d1d <= DeviceDofQuadLimits::Get().MAX_D1D, 
"");
 
 1038   MFEM_VERIFY(T_Q1D ? T_Q1D : q1d <= DeviceDofQuadLimits::Get().MAX_Q1D, 
"");
 
 1040   const auto B = b_.Read();
 
 1041   const auto Bt = bt_.Read();
 
 1042   const auto D = d_.Read();
 
 1043   const auto X = x_.Read();
 
 1044   auto Y = y_.ReadWrite();
 
 1048      internal::PAMassApply2D_Element(e, NE, B, Bt, D, X, Y, d1d, q1d);
 
 1053template<
int T_D1D = 0, 
int T_Q1D = 0>
 
 1054inline void SmemPAMassApply2D(
const int NE,
 
 1055                              const Array<real_t> &b_,
 
 1056                              const Array<real_t> &bt_,
 
 1063   MFEM_CONTRACT_VAR(bt_);
 
 1064   static constexpr int T_NBZ = mass::NBZ(T_D1D);
 
 1065   static constexpr int NBZ = T_NBZ ? T_NBZ : 1;
 
 1066   const int D1D = T_D1D ? T_D1D : d1d;
 
 1067   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 1070   MFEM_VERIFY(D1D <= max_d1d, 
"");
 
 1071   MFEM_VERIFY(Q1D <= max_q1d, 
"");
 
 1072   const auto b = b_.Read();
 
 1073   const auto D = d_.Read();
 
 1074   const auto x = x_.Read();
 
 1075   auto Y = y_.ReadWrite();
 
 1078      internal::SmemPAMassApply2D_Element<T_D1D,T_Q1D,T_NBZ>(
 
 1079         e, NE, 
b, D, x, Y, d1d, q1d);
 
 1084template<
int T_D1D = 0, 
int T_Q1D = 0>
 
 1085inline void PAMassApply3D(
const int NE,
 
 1086                          const Array<real_t> &b_,
 
 1087                          const Array<real_t> &bt_,
 
 1094   MFEM_VERIFY(T_D1D ? T_D1D : d1d <= DeviceDofQuadLimits::Get().MAX_D1D, 
"");
 
 1095   MFEM_VERIFY(T_Q1D ? T_Q1D : q1d <= DeviceDofQuadLimits::Get().MAX_Q1D, 
"");
 
 1097   const auto B = b_.Read();
 
 1098   const auto Bt = bt_.Read();
 
 1099   const auto D = d_.Read();
 
 1100   const auto X = x_.Read();
 
 1101   auto Y = y_.ReadWrite();
 
 1105      internal::PAMassApply3D_Element(e, NE, B, Bt, D, X, Y, d1d, q1d);
 
 1110template<
int T_D1D = 0, 
int T_Q1D = 0>
 
 1111inline void SmemPAMassApply3D(
const int NE,
 
 1112                              const Array<real_t> &b_,
 
 1113                              const Array<real_t> &bt_,
 
 1120   MFEM_CONTRACT_VAR(bt_);
 
 1121   const int D1D = T_D1D ? T_D1D : d1d;
 
 1122   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 1125   MFEM_VERIFY(D1D <= max_d1d, 
"");
 
 1126   MFEM_VERIFY(Q1D <= max_q1d, 
"");
 
 1130   auto y = y_.ReadWrite();
 
 1133      internal::SmemPAMassApply3D_Element<T_D1D,T_Q1D>(e, NE, 
b, d, x, y, d1d, q1d);
 
 1137template<
int T_D1D = 0, 
int T_Q1D = 0>
 
 1138inline void EAMassAssemble1D(
const int NE,
 
 1139                             const Array<real_t> &basis,
 
 1140                             const Vector &padata,
 
 1146   const int D1D = T_D1D ? T_D1D : d1d;
 
 1147   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 1150   auto B = 
Reshape(basis.Read(), Q1D, D1D);
 
 1151   auto D = 
Reshape(padata.Read(), Q1D, NE);
 
 1152   auto M = 
Reshape(
add ? eadata.ReadWrite() : eadata.
Write(), D1D, D1D, NE);
 
 1155      const int D1D = T_D1D ? T_D1D : d1d;
 
 1156      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 1157      constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
 1158      MFEM_FOREACH_THREAD(i1,x,D1D)
 
 1161         for (
int q = 0; q < Q1D; q++) { r_Bi[q] = B(q,i1); }
 
 1162         MFEM_FOREACH_THREAD(j1,y,D1D)
 
 1165            for (
int q = 0; q < Q1D; q++) { r_Bj[q] = B(q,j1); }
 
 1168            for (
int k1 = 0; k1 < Q1D; ++k1)
 
 1170               val += r_Bi[k1] * r_Bj[k1] * D(k1, e);
 
 1174               M(i1, j1, e) += val;
 
 1185template<
int T_D1D = 0, 
int T_Q1D = 0>
 
 1186inline void EAMassAssemble2D(
const int NE,
 
 1187                             const Array<real_t> &basis,
 
 1188                             const Vector &padata,
 
 1194   const int D1D = T_D1D ? T_D1D : d1d;
 
 1195   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 1198   auto B = 
Reshape(basis.Read(), Q1D, D1D);
 
 1199   auto D = 
Reshape(padata.Read(), Q1D, Q1D, NE);
 
 1200   auto M = 
Reshape(
add ? eadata.ReadWrite() : eadata.
Write(), D1D, D1D, D1D, D1D,
 
 1204      const int D1D = T_D1D ? T_D1D : d1d;
 
 1205      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 1206      constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
 1207      constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
 1209      for (
int d = 0; d < D1D; d++)
 
 1211         for (
int q = 0; q < Q1D; q++)
 
 1216      MFEM_SHARED 
real_t s_D[MQ1][MQ1];
 
 1217      MFEM_FOREACH_THREAD(k1,x,Q1D)
 
 1219         MFEM_FOREACH_THREAD(k2,y,Q1D)
 
 1221            s_D[k1][k2] = D(k1,k2,e);
 
 1225      MFEM_FOREACH_THREAD(i1,x,D1D)
 
 1227         MFEM_FOREACH_THREAD(i2,y,D1D)
 
 1229            for (
int j1 = 0; j1 < D1D; ++j1)
 
 1231               for (
int j2 = 0; j2 < D1D; ++j2)
 
 1234                  for (
int k1 = 0; k1 < Q1D; ++k1)
 
 1236                     for (
int k2 = 0; k2 < Q1D; ++k2)
 
 1238                        val += r_B[k1][i1] * r_B[k1][j1]
 
 1239                               * r_B[k2][i2] * r_B[k2][j2]
 
 1245                     M(i1, i2, j1, j2, e) += val;
 
 1249                     M(i1, i2, j1, j2, e) = val;
 
 1258template<
int T_D1D = 0, 
int T_Q1D = 0>
 
 1259inline void EAMassAssemble3D(
const int NE,
 
 1260                             const Array<real_t> &basis,
 
 1261                             const Vector &padata,
 
 1267   const int D1D = T_D1D ? T_D1D : d1d;
 
 1268   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 1271   auto B = 
Reshape(basis.Read(), Q1D, D1D);
 
 1272   auto D = 
Reshape(padata.Read(), Q1D, Q1D, Q1D, NE);
 
 1273   auto M = 
Reshape(
add ? eadata.ReadWrite() : eadata.
Write(), D1D, D1D, D1D, D1D,
 
 1277      const int D1D = T_D1D ? T_D1D : d1d;
 
 1278      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 1279      constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
 1280      constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
 1281      constexpr int DQ = T_D1D * T_Q1D;
 
 1285      constexpr bool USE_REG = DQ != 0 && DQ <= 12;
 
 1286      constexpr int MD1r = USE_REG ? MD1 : 1;
 
 1287      constexpr int MQ1r = USE_REG ? MQ1 : 1;
 
 1288      constexpr int MD1s = USE_REG ? 1 : MD1;
 
 1289      constexpr int MQ1s = USE_REG ? 1 : MQ1;
 
 1291      MFEM_SHARED 
real_t s_B[MQ1s][MD1s];
 
 1293      real_t (*l_B)[MD1] = 
nullptr;
 
 1296         for (
int d = 0; d < D1D; d++)
 
 1298            for (
int q = 0; q < Q1D; q++)
 
 1303         l_B = (
real_t (*)[MD1])r_B;
 
 1307         if (MFEM_THREAD_ID(z) == 0)
 
 1309            MFEM_FOREACH_THREAD(d,x,D1D)
 
 1311               MFEM_FOREACH_THREAD(q,y,Q1D)
 
 1317         l_B = (
real_t (*)[MD1])s_B;
 
 1320      MFEM_SHARED 
real_t s_D[MQ1][MQ1][MQ1];
 
 1321      MFEM_FOREACH_THREAD(k1,x,Q1D)
 
 1323         MFEM_FOREACH_THREAD(k2,y,Q1D)
 
 1325            MFEM_FOREACH_THREAD(k3,z,Q1D)
 
 1327               s_D[k1][k2][k3] = D(k1,k2,k3,e);
 
 1332      MFEM_FOREACH_THREAD(i1,x,D1D)
 
 1334         MFEM_FOREACH_THREAD(i2,y,D1D)
 
 1336            MFEM_FOREACH_THREAD(i3,z,D1D)
 
 1338               for (
int j1 = 0; j1 < D1D; ++j1)
 
 1340                  for (
int j2 = 0; j2 < D1D; ++j2)
 
 1342                     for (
int j3 = 0; j3 < D1D; ++j3)
 
 1345                        for (
int k1 = 0; k1 < Q1D; ++k1)
 
 1347                           for (
int k2 = 0; k2 < Q1D; ++k2)
 
 1349                              for (
int k3 = 0; k3 < Q1D; ++k3)
 
 1351                                 val += l_B[k1][i1] * l_B[k1][j1]
 
 1352                                        * l_B[k2][i2] * l_B[k2][j2]
 
 1353                                        * l_B[k3][i3] * l_B[k3][j3]
 
 1360                           M(i1, i2, i3, j1, j2, j3, e) += val;
 
 1364                           M(i1, i2, i3, j1, j2, j3, e) = val;
 
 1383template<
int DIM, 
int T_D1D, 
int T_Q1D>
 
 1384ApplyKernelType MassIntegrator::ApplyPAKernels::Kernel()
 
 1386   if (
DIM == 1) { 
return internal::PAMassApply1D; }
 
 1387   else if (
DIM == 2) { 
return internal::SmemPAMassApply2D<T_D1D,T_Q1D>; }
 
 1388   else if (
DIM == 3) { 
return internal::SmemPAMassApply3D<T_D1D, T_Q1D>; }
 
 1389   else { MFEM_ABORT(
""); }
 
 1392inline ApplyKernelType MassIntegrator::ApplyPAKernels::Fallback(
 
 1395   if (
DIM == 1) { 
return internal::PAMassApply1D; }
 
 1396   else if (
DIM == 2) { 
return internal::PAMassApply2D; }
 
 1397   else if (
DIM == 3) { 
return internal::PAMassApply3D; }
 
 1398   else { MFEM_ABORT(
""); }
 
 1401template<
int DIM, 
int T_D1D, 
int T_Q1D>
 
 1402DiagonalKernelType MassIntegrator::DiagonalPAKernels::Kernel()
 
 1404   if (
DIM == 1) { 
return internal::PAMassAssembleDiagonal1D; }
 
 1405   else if (
DIM == 2) { 
return internal::SmemPAMassAssembleDiagonal2D<T_D1D,T_Q1D>; }
 
 1406   else if (
DIM == 3) { 
return internal::SmemPAMassAssembleDiagonal3D<T_D1D, T_Q1D>; }
 
 1407   else { MFEM_ABORT(
""); }
 
 1410inline DiagonalKernelType MassIntegrator::DiagonalPAKernels::Fallback(
 
 1413   if (
DIM == 1) { 
return internal::PAMassAssembleDiagonal1D; }
 
 1414   else if (
DIM == 2) { 
return internal::PAMassAssembleDiagonal2D; }
 
 1415   else if (
DIM == 3) { 
return internal::PAMassAssembleDiagonal3D; }
 
 1416   else { MFEM_ABORT(
""); }
 
void(*)(const int, const Array< real_t > &, const Array< real_t > &, const Vector &, const Vector &, Vector &, const int, const int) ApplyKernelType
void(*)(const int, const Array< real_t > &, const Vector &, Vector &, const int, const int) DiagonalKernelType
real_t u(const Vector &xvec)
DeviceTensor< 2, real_t > DeviceMatrix
T * Write(Memory< T > &mem, int size, bool on_dev=true)
Get a pointer for write access to mem with the mfem::Device's DeviceMemoryClass, if on_dev = true,...
void add(const Vector &v1, const Vector &v2, Vector &v)
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
DeviceTensor< 2, const real_t > ConstDeviceMatrix
DeviceTensor< 3, const real_t > ConstDeviceCube
void forall_2D(int N, int X, int Y, lambda &&body)
void forall_3D(int N, int X, int Y, int Z, lambda &&body)
DeviceTensor< 3, real_t > DeviceCube
void forall(int N, lambda &&body)
real_t p(const Vector &x, real_t t)
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.
int MAX_D1D
Maximum number of 1D nodal points.
int MAX_Q1D
Maximum number of 1D quadrature points.