73static void PAGradientSetup2D(
const int Q1D,
 
   76                              const Array<real_t> &w,
 
   83   const int NQ = Q1D*Q1D;
 
   85   auto J = 
Reshape(j.Read(), NQ, 2, 2, NE);
 
   86   auto DETJ = 
Reshape(detj.Read(), NQ, NE);
 
   87   auto y = 
Reshape(op.Write(), NQ, 2, 2, NE);
 
   89   const bool const_c = c.Size() == 1;
 
   90   const auto C = const_c ? 
Reshape(c.Read(), 1,1) :
 
   95      for (
int q = 0; q < NQ; ++q)
 
   97         const real_t J11 = J(q,0,0,e);
 
   98         const real_t J12 = J(q,0,1,e);
 
   99         const real_t J21 = J(q,1,0,e);
 
  100         const real_t J22 = J(q,1,1,e);
 
  102         const real_t Co = const_c ? C(0,0) : C(q,e);
 
  103         const real_t cw = W[q] * Co * (by_val ? 1.0 : 1.0/DETJ(q,e));
 
  105         y(q,0,0,e) = cw *  J22; 
 
  106         y(q,0,1,e) = cw * -J12; 
 
  107         y(q,1,0,e) = cw * -J21; 
 
  108         y(q,1,1,e) = cw *  J11; 
 
  114static void PAGradientSetup3D(
const int Q1D,
 
  117                              const Array<real_t> &w,
 
  124   const int NQ = Q1D*Q1D*Q1D;
 
  126   auto J = 
Reshape(j.Read(), NQ, 3, 3, NE);
 
  127   auto DETJ = 
Reshape(detj.Read(), NQ, NE);
 
  128   auto y = 
Reshape(op.Write(), NQ, 3, 3, NE);
 
  130   const bool const_c = c.Size() == 1;
 
  131   const auto C = const_c ? 
Reshape(c.Read(), 1,1) :
 
  136      for (
int q = 0; q < NQ; ++q)
 
  138         const real_t J11 = J(q,0,0,e);
 
  139         const real_t J21 = J(q,1,0,e);
 
  140         const real_t J31 = J(q,2,0,e);
 
  141         const real_t J12 = J(q,0,1,e);
 
  142         const real_t J22 = J(q,1,1,e);
 
  143         const real_t J32 = J(q,2,1,e);
 
  144         const real_t J13 = J(q,0,2,e);
 
  145         const real_t J23 = J(q,1,2,e);
 
  146         const real_t J33 = J(q,2,2,e);
 
  148         const real_t A11 = (J22 * J33) - (J23 * J32);
 
  149         const real_t A12 = (J32 * J13) - (J12 * J33);
 
  150         const real_t A13 = (J12 * J23) - (J22 * J13);
 
  151         const real_t A21 = (J31 * J23) - (J21 * J33);
 
  152         const real_t A22 = (J11 * J33) - (J13 * J31);
 
  153         const real_t A23 = (J21 * J13) - (J11 * J23);
 
  154         const real_t A31 = (J21 * J32) - (J31 * J22);
 
  155         const real_t A32 = (J31 * J12) - (J11 * J32);
 
  156         const real_t A33 = (J11 * J22) - (J12 * J21);
 
  158         const real_t Co = const_c ? C(0,0) : C(q,e);
 
  159         const real_t cw = W[q] * Co * (by_val ? 1.0 : 1.0/DETJ(q,e));
 
  161         y(q,0,0,e) = cw * A11; 
 
  162         y(q,0,1,e) = cw * A12; 
 
  163         y(q,0,2,e) = cw * A13; 
 
  164         y(q,1,0,e) = cw * A21; 
 
  165         y(q,1,1,e) = cw * A22; 
 
  166         y(q,1,2,e) = cw * A23; 
 
  167         y(q,2,0,e) = cw * A31; 
 
  168         y(q,2,1,e) = cw * A32; 
 
  169         y(q,2,2,e) = cw * A33; 
 
  174static void PAGradientSetup(
const int dim,
 
  180                            const Array<real_t> &W,
 
  186   if (
dim == 1) { MFEM_ABORT(
"dim==1 not supported in PAGradientSetup"); }
 
  189      PAGradientSetup2D(Q1D, NE, MAP_TYPE, W, J, DET_J, COEFF, op);
 
  193      PAGradientSetup3D(Q1D, NE, MAP_TYPE, W, J, DET_J, COEFF, op);
 
  202               "PA Only supports Ordering::byNODES!");
 
  210   const int dims = trial_fe.
GetDim();
 
  211   const int dimsToStore = dims * dims;
 
  214   ne = trial_fes.
GetNE();
 
  218   trial_dofs1D = trial_maps->
ndof;
 
  219   quad1D = trial_maps->
nqpt;
 
  221   test_dofs1D = test_maps->
ndof;
 
  222   MFEM_ASSERT(quad1D == test_maps->
nqpt,
 
  223               "PA requires test and trial space to have same number of quadrature points!");
 
  229   PAGradientSetup(
dim, trial_dofs1D, test_dofs1D, quad1D, ne,
 
 
  235template<
int T_TR_D1D = 0, 
int T_TE_D1D = 0, 
int T_Q1D = 0>
 
  236static void PAGradientApply2D(
const int NE,
 
  243                              const int tr_d1d = 0,
 
  244                              const int te_d1d = 0,
 
  247   const int TR_D1D = T_TR_D1D ? T_TR_D1D : tr_d1d;
 
  248   const int TE_D1D = T_TE_D1D ? T_TE_D1D : te_d1d;
 
  249   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  253   auto B = 
Reshape(
b.Read(), Q1D, TR_D1D);
 
  261      const int TR_D1D = T_TR_D1D ? T_TR_D1D : tr_d1d;
 
  262      const int TE_D1D = T_TE_D1D ? T_TE_D1D : te_d1d;
 
  263      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  266      constexpr int max_TE_D1D = T_TE_D1D ? T_TE_D1D : DofQuadLimits::MAX_D1D;
 
  267      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  269      real_t grad[max_Q1D][max_Q1D][VDIM];
 
  270      for (
int qy = 0; qy < Q1D; ++qy)
 
  272         for (
int qx = 0; qx < Q1D; ++qx)
 
  274            grad[qy][qx][0] = 0.0;
 
  275            grad[qy][qx][1] = 0.0;
 
  278      for (
int dy = 0; dy < TR_D1D; ++dy)
 
  280         real_t gradX[max_Q1D][VDIM];
 
  281         for (
int qx = 0; qx < Q1D; ++qx)
 
  286         for (
int dx = 0; dx < TR_D1D; ++dx)
 
  288            const real_t s = x(dx,dy,e);
 
  289            for (
int qx = 0; qx < Q1D; ++qx)
 
  291               gradX[qx][0] += s * G(qx,dx);
 
  292               gradX[qx][1] += s * B(qx,dx);
 
  295         for (
int qy = 0; qy < Q1D; ++qy)
 
  297            const real_t wy  = B(qy,dy);
 
  298            const real_t wDy = G(qy,dy);
 
  299            for (
int qx = 0; qx < Q1D; ++qx)
 
  301               grad[qy][qx][0] += gradX[qx][0] * wy;
 
  302               grad[qy][qx][1] += gradX[qx][1] * wDy;
 
  307      for (
int qy = 0; qy < Q1D; ++qy)
 
  309         for (
int qx = 0; qx < Q1D; ++qx)
 
  311            const int q = qx + qy * Q1D;
 
  312            const real_t gradX = grad[qy][qx][0];
 
  313            const real_t gradY = grad[qy][qx][1];
 
  315            grad[qy][qx][0] = gradX*op(q,0,0,e) + gradY*op(q,1,0,e);
 
  316            grad[qy][qx][1] = gradX*op(q,0,1,e) + gradY*op(q,1,1,e);
 
  320      for (
int qy = 0; qy < Q1D; ++qy)
 
  322         real_t opX[max_TE_D1D][VDIM];
 
  323         for (
int dx = 0; dx < TE_D1D; ++dx)
 
  327            for (
int qx = 0; qx < Q1D; ++qx)
 
  329               opX[dx][0] += Bt(dx,qx)*grad[qy][qx][0];
 
  330               opX[dx][1] += Bt(dx,qx)*grad[qy][qx][1];
 
  333         for (
int dy = 0; dy < TE_D1D; ++dy)
 
  335            const real_t wy = Bt(dy,qy);
 
  336            for (
int dx = 0; dx < TE_D1D; ++dx)
 
  338               y(dx,dy,0,e) += wy*opX[dx][0];
 
  339               y(dx,dy,1,e) += wy*opX[dx][1];
 
  348template<
int T_TR_D1D = 0, 
int T_TE_D1D = 0, 
int T_Q1D = 0>
 
  349static void PAGradientApplyTranspose2D(
const int NE,
 
  350                                       const Array<real_t> &bt,
 
  351                                       const Array<real_t> >,
 
  352                                       const Array<real_t> &
b,
 
  356                                       const int tr_d1d = 0,
 
  357                                       const int te_d1d = 0,
 
  360   const int TR_D1D = T_TR_D1D ? T_TR_D1D : tr_d1d;
 
  361   const int TE_D1D = T_TE_D1D ? T_TE_D1D : te_d1d;
 
  362   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  366   auto Bt = 
Reshape(bt.Read(), TR_D1D, Q1D);
 
  367   auto Gt = 
Reshape(gt.Read(), TR_D1D, Q1D);
 
  368   auto B = 
Reshape(
b.Read(), Q1D, TE_D1D);
 
  369   auto op = 
Reshape(op_.Read(), Q1D*Q1D, 2,2, NE);
 
  370   auto x = 
Reshape(x_.Read(), TE_D1D, TE_D1D, 2, NE);
 
  371   auto y = 
Reshape(y_.ReadWrite(), TR_D1D, TR_D1D, NE);
 
  374      const int TR_D1D = T_TR_D1D ? T_TR_D1D : tr_d1d;
 
  375      const int TE_D1D = T_TE_D1D ? T_TE_D1D : te_d1d;
 
  376      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  379      constexpr int max_TR_D1D = T_TR_D1D ? T_TR_D1D : DofQuadLimits::MAX_D1D;
 
  380      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  383      real_t Bxy[max_Q1D][max_Q1D][VDIM];
 
  384      for (
int qy = 0; qy < Q1D; ++qy)
 
  386         for (
int qx = 0; qx < Q1D; ++qx)
 
  388            Bxy[qy][qx][0] = 0.0;
 
  389            Bxy[qy][qx][1] = 0.0;
 
  392      for (
int dy = 0; dy < TE_D1D; ++dy)
 
  395         for (
int qx = 0; qx < Q1D; ++qx)
 
  400         for (
int dx = 0; dx < TE_D1D; ++dx)
 
  402            const real_t s0 = x(dx,dy,0,e);
 
  403            const real_t s1 = x(dx,dy,1,e);
 
  404            for (
int qx = 0; qx < Q1D; ++qx)
 
  406               Bx[qx][0] += s0 * B(qx,dx);
 
  407               Bx[qx][1] += s1 * B(qx,dx);
 
  410         for (
int qy = 0; qy < Q1D; ++qy)
 
  412            const real_t wy = B(qy,dy);
 
  413            for (
int qx = 0; qx < Q1D; ++qx)
 
  415               Bxy[qy][qx][0] += Bx[qx][0] * wy;
 
  416               Bxy[qy][qx][1] += Bx[qx][1] * wy;
 
  421      for (
int qy = 0; qy < Q1D; ++qy)
 
  423         for (
int qx = 0; qx < Q1D; ++qx)
 
  425            const int q = qx + qy * Q1D;
 
  426            const real_t Bxy0 = Bxy[qy][qx][0];
 
  427            const real_t Bxy1 = Bxy[qy][qx][1];
 
  429            Bxy[qy][qx][0] = Bxy0*op(q,0,0,e) + Bxy1*op(q,0,1,e);
 
  430            Bxy[qy][qx][1] = Bxy0*op(q,1,0,e) + Bxy1*op(q,1,1,e);
 
  434      for (
int qy = 0; qy < Q1D; ++qy)
 
  436         real_t Btx[max_TR_D1D][VDIM];
 
  437         for (
int dx = 0; dx < TR_D1D; ++dx)
 
  441            for (
int qx = 0; qx < Q1D; ++qx)
 
  443               Btx[dx][0] += Gt(dx,qx)*Bxy[qy][qx][0];
 
  444               Btx[dx][1] += Bt(dx,qx)*Bxy[qy][qx][1];
 
  447         for (
int dy = 0; dy < TR_D1D; ++dy)
 
  449            const real_t wy = Bt(dy,qy);
 
  450            const real_t wDy = Gt(dy,qy);
 
  451            for (
int dx = 0; dx < TR_D1D; ++dx)
 
  453               y(dx,dy,e) += wy*Btx[dx][0] + wDy*Btx[dx][1];
 
  461template<const 
int T_TR_D1D = 0, const 
int T_TE_D1D = 0, const 
int T_Q1D = 0>
 
  462static void PAGradientApply3D(
const int NE,
 
  463                              const Array<real_t> &
b,
 
  464                              const Array<real_t> &g,
 
  465                              const Array<real_t> &bt,
 
  473   const int TR_D1D = T_TR_D1D ? T_TR_D1D : tr_d1d;
 
  474   const int TE_D1D = T_TE_D1D ? T_TE_D1D : te_d1d;
 
  475   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  479   auto B = 
Reshape(
b.Read(), Q1D, TR_D1D);
 
  480   auto G = 
Reshape(g.Read(), Q1D, TR_D1D);
 
  481   auto Bt = 
Reshape(bt.Read(), TE_D1D, Q1D);
 
  482   auto op = 
Reshape(op_.Read(), Q1D*Q1D*Q1D, 3,3, NE);
 
  483   auto x = 
Reshape(x_.Read(), TR_D1D, TR_D1D, TR_D1D, NE);
 
  484   auto y = 
Reshape(y_.ReadWrite(), TE_D1D, TE_D1D, TE_D1D, 3, NE);
 
  487      const int TR_D1D = T_TR_D1D ? T_TR_D1D : tr_d1d;
 
  488      const int TE_D1D = T_TE_D1D ? T_TE_D1D : te_d1d;
 
  489      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  492      constexpr int max_TE_D1D = T_TE_D1D ? T_TE_D1D : DofQuadLimits::MAX_D1D;
 
  493      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  495      real_t grad[max_Q1D][max_Q1D][max_Q1D][VDIM];
 
  496      for (
int qz = 0; qz < Q1D; ++qz)
 
  498         for (
int qy = 0; qy < Q1D; ++qy)
 
  500            for (
int qx = 0; qx < Q1D; ++qx)
 
  502               grad[qz][qy][qx][0] = 0.0;
 
  503               grad[qz][qy][qx][1] = 0.0;
 
  504               grad[qz][qy][qx][2] = 0.0;
 
  508      for (
int dz = 0; dz < TR_D1D; ++dz)
 
  510         real_t gradXY[max_Q1D][max_Q1D][3];
 
  511         for (
int qy = 0; qy < Q1D; ++qy)
 
  513            for (
int qx = 0; qx < Q1D; ++qx)
 
  515               gradXY[qy][qx][0] = 0.0;
 
  516               gradXY[qy][qx][1] = 0.0;
 
  517               gradXY[qy][qx][2] = 0.0;
 
  520         for (
int dy = 0; dy < TR_D1D; ++dy)
 
  523            for (
int qx = 0; qx < Q1D; ++qx)
 
  528            for (
int dx = 0; dx < TR_D1D; ++dx)
 
  530               const real_t s = x(dx,dy,dz,e);
 
  531               for (
int qx = 0; qx < Q1D; ++qx)
 
  533                  gradX[qx][0] += s * B(qx,dx);
 
  534                  gradX[qx][1] += s * G(qx,dx);
 
  537            for (
int qy = 0; qy < Q1D; ++qy)
 
  539               const real_t wy  = B(qy,dy);
 
  540               const real_t wDy = G(qy,dy);
 
  541               for (
int qx = 0; qx < Q1D; ++qx)
 
  543                  const real_t wx  = gradX[qx][0];
 
  544                  const real_t wDx = gradX[qx][1];
 
  545                  gradXY[qy][qx][0] += wDx * wy;
 
  546                  gradXY[qy][qx][1] += wx  * wDy;
 
  547                  gradXY[qy][qx][2] += wx  * wy;
 
  551         for (
int qz = 0; qz < Q1D; ++qz)
 
  553            const real_t wz  = B(qz,dz);
 
  554            const real_t wDz = G(qz,dz);
 
  555            for (
int qy = 0; qy < Q1D; ++qy)
 
  557               for (
int qx = 0; qx < Q1D; ++qx)
 
  559                  grad[qz][qy][qx][0] += gradXY[qy][qx][0] * wz;
 
  560                  grad[qz][qy][qx][1] += gradXY[qy][qx][1] * wz;
 
  561                  grad[qz][qy][qx][2] += gradXY[qy][qx][2] * wDz;
 
  567      for (
int qz = 0; qz < Q1D; ++qz)
 
  569         for (
int qy = 0; qy < Q1D; ++qy)
 
  571            for (
int qx = 0; qx < Q1D; ++qx)
 
  573               const int q = qx + (qy + qz * Q1D) * Q1D;
 
  574               const real_t gradX = grad[qz][qy][qx][0];
 
  575               const real_t gradY = grad[qz][qy][qx][1];
 
  576               const real_t gradZ = grad[qz][qy][qx][2];
 
  578               grad[qz][qy][qx][0] = gradX*op(q,0,0,e) + gradY*op(q,1,0,e) + gradZ*op(q,2,0,e);
 
  579               grad[qz][qy][qx][1] = gradX*op(q,0,1,e) + gradY*op(q,1,1,e) + gradZ*op(q,2,1,e);
 
  580               grad[qz][qy][qx][2] = gradX*op(q,0,2,e) + gradY*op(q,1,2,e) + gradZ*op(q,2,2,e);
 
  585      for (
int qz = 0; qz < Q1D; ++qz)
 
  587         real_t opXY[max_TE_D1D][max_TE_D1D][VDIM];
 
  588         for (
int dy = 0; dy < TE_D1D; ++dy)
 
  590            for (
int dx = 0; dx < TE_D1D; ++dx)
 
  592               opXY[dy][dx][0] = 0.0;
 
  593               opXY[dy][dx][1] = 0.0;
 
  594               opXY[dy][dx][2] = 0.0;
 
  597         for (
int qy = 0; qy < Q1D; ++qy)
 
  599            real_t opX[max_TE_D1D][VDIM];
 
  600            for (
int dx = 0; dx < TE_D1D; ++dx)
 
  605               for (
int qx = 0; qx < Q1D; ++qx)
 
  607                  opX[dx][0] += Bt(dx,qx)*grad[qz][qy][qx][0];
 
  608                  opX[dx][1] += Bt(dx,qx)*grad[qz][qy][qx][1];
 
  609                  opX[dx][2] += Bt(dx,qx)*grad[qz][qy][qx][2];
 
  612            for (
int dy = 0; dy < TE_D1D; ++dy)
 
  614               for (
int dx = 0; dx < TE_D1D; ++dx)
 
  616                  opXY[dy][dx][0] += Bt(dy,qy)*opX[dx][0];
 
  617                  opXY[dy][dx][1] += Bt(dy,qy)*opX[dx][1];
 
  618                  opXY[dy][dx][2] += Bt(dy,qy)*opX[dx][2];
 
  622         for (
int dz = 0; dz < TE_D1D; ++dz)
 
  624            for (
int dy = 0; dy < TE_D1D; ++dy)
 
  626               for (
int dx = 0; dx < TE_D1D; ++dx)
 
  628                  y(dx,dy,dz,0,e) += Bt(dz,qz)*opXY[dy][dx][0];
 
  629                  y(dx,dy,dz,1,e) += Bt(dz,qz)*opXY[dy][dx][1];
 
  630                  y(dx,dy,dz,2,e) += Bt(dz,qz)*opXY[dy][dx][2];
 
  640template<const 
int T_TR_D1D = 0, const 
int T_TE_D1D = 0, const 
int T_Q1D = 0>
 
  641static void PAGradientApplyTranspose3D(
const int NE,
 
  642                                       const Array<real_t> &bt,
 
  643                                       const Array<real_t> >,
 
  644                                       const Array<real_t> &
b,
 
  652   const int TR_D1D = T_TR_D1D ? T_TR_D1D : tr_d1d;
 
  653   const int TE_D1D = T_TE_D1D ? T_TE_D1D : te_d1d;
 
  654   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  658   auto Bt = 
Reshape(bt.Read(), TR_D1D, Q1D);
 
  659   auto Gt = 
Reshape(gt.Read(), TR_D1D, Q1D);
 
  660   auto B = 
Reshape(
b.Read(), Q1D, TE_D1D);
 
  661   auto op = 
Reshape(op_.Read(), Q1D*Q1D*Q1D, 3,3, NE);
 
  662   auto x = 
Reshape(x_.Read(), TE_D1D, TE_D1D, TE_D1D, 3, NE);
 
  663   auto y = 
Reshape(y_.ReadWrite(), TR_D1D, TR_D1D, TR_D1D, NE);
 
  666      const int TR_D1D = T_TR_D1D ? T_TR_D1D : tr_d1d;
 
  667      const int TE_D1D = T_TE_D1D ? T_TE_D1D : te_d1d;
 
  668      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  671      constexpr int max_TR_D1D = T_TR_D1D ? T_TR_D1D : DofQuadLimits::MAX_D1D;
 
  672      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  675      real_t Bxyz[max_Q1D][max_Q1D][max_Q1D][VDIM];
 
  676      for (
int qz = 0; qz < Q1D; ++qz)
 
  678         for (
int qy = 0; qy < Q1D; ++qy)
 
  680            for (
int qx = 0; qx < Q1D; ++qx)
 
  682               Bxyz[qz][qy][qx][0] = 0.0;
 
  683               Bxyz[qz][qy][qx][1] = 0.0;
 
  684               Bxyz[qz][qy][qx][2] = 0.0;
 
  688      for (
int dz = 0; dz < TE_D1D; ++dz)
 
  690         real_t Bxy[max_Q1D][max_Q1D][3];
 
  691         for (
int qy = 0; qy < Q1D; ++qy)
 
  693            for (
int qx = 0; qx < Q1D; ++qx)
 
  695               Bxy[qy][qx][0] = 0.0;
 
  696               Bxy[qy][qx][1] = 0.0;
 
  697               Bxy[qy][qx][2] = 0.0;
 
  700         for (
int dy = 0; dy < TE_D1D; ++dy)
 
  703            for (
int qx = 0; qx < Q1D; ++qx)
 
  709            for (
int dx = 0; dx < TE_D1D; ++dx)
 
  711               const real_t s0 = x(dx,dy,dz,0,e);
 
  712               const real_t s1 = x(dx,dy,dz,1,e);
 
  713               const real_t s2 = x(dx,dy,dz,2,e);
 
  714               for (
int qx = 0; qx < Q1D; ++qx)
 
  716                  Bx[qx][0] += s0 * B(qx,dx);
 
  717                  Bx[qx][1] += s1 * B(qx,dx);
 
  718                  Bx[qx][2] += s2 * B(qx,dx);
 
  721            for (
int qy = 0; qy < Q1D; ++qy)
 
  723               const real_t wy = B(qy,dy);
 
  724               for (
int qx = 0; qx < Q1D; ++qx)
 
  726                  Bxy[qy][qx][0] += Bx[qx][0] * wy;
 
  727                  Bxy[qy][qx][1] += Bx[qx][1] * wy;
 
  728                  Bxy[qy][qx][2] += Bx[qx][2] * wy;
 
  732         for (
int qz = 0; qz < Q1D; ++qz)
 
  734            const real_t wz = B(qz,dz);
 
  735            for (
int qy = 0; qy < Q1D; ++qy)
 
  737               for (
int qx = 0; qx < Q1D; ++qx)
 
  739                  Bxyz[qz][qy][qx][0] += Bxy[qy][qx][0] * wz;
 
  740                  Bxyz[qz][qy][qx][1] += Bxy[qy][qx][1] * wz;
 
  741                  Bxyz[qz][qy][qx][2] += Bxy[qy][qx][2] * wz;
 
  747      for (
int qz = 0; qz < Q1D; ++qz)
 
  749         for (
int qy = 0; qy < Q1D; ++qy)
 
  751            for (
int qx = 0; qx < Q1D; ++qx)
 
  753               const int q = qx + (qy + qz * Q1D) * Q1D;
 
  754               const real_t Bxyz0 = Bxyz[qz][qy][qx][0];
 
  755               const real_t Bxyz1 = Bxyz[qz][qy][qx][1];
 
  756               const real_t Bxyz2 = Bxyz[qz][qy][qx][2];
 
  758               Bxyz[qz][qy][qx][0] = Bxyz0*op(q,0,0,e) + Bxyz1*op(q,0,1,e) + Bxyz2*op(q,0,2,e);
 
  759               Bxyz[qz][qy][qx][1] = Bxyz0*op(q,1,0,e) + Bxyz1*op(q,1,1,e) + Bxyz2*op(q,1,2,e);
 
  760               Bxyz[qz][qy][qx][2] = Bxyz0*op(q,2,0,e) + Bxyz1*op(q,2,1,e) + Bxyz2*op(q,2,2,e);
 
  765      for (
int qz = 0; qz < Q1D; ++qz)
 
  767         real_t Btxy[max_TR_D1D][max_TR_D1D][VDIM];
 
  768         for (
int dy = 0; dy < TR_D1D; ++dy)
 
  770            for (
int dx = 0; dx < TR_D1D; ++dx)
 
  772               Btxy[dy][dx][0] = 0.0;
 
  773               Btxy[dy][dx][1] = 0.0;
 
  774               Btxy[dy][dx][2] = 0.0;
 
  777         for (
int qy = 0; qy < Q1D; ++qy)
 
  779            real_t Btx[max_TR_D1D][VDIM];
 
  780            for (
int dx = 0; dx < TR_D1D; ++dx)
 
  785               for (
int qx = 0; qx < Q1D; ++qx)
 
  787                  Btx[dx][0] += Gt(dx,qx)*Bxyz[qz][qy][qx][0];
 
  788                  Btx[dx][1] += Bt(dx,qx)*Bxyz[qz][qy][qx][1];
 
  789                  Btx[dx][2] += Bt(dx,qx)*Bxyz[qz][qy][qx][2];
 
  792            for (
int dy = 0; dy < TR_D1D; ++dy)
 
  794               const real_t wy = Bt(dy,qy);
 
  795               const real_t wDy = Gt(dy,qy);
 
  796               for (
int dx = 0; dx < TR_D1D; ++dx)
 
  798                  Btxy[dy][dx][0] += wy *Btx[dx][0];
 
  799                  Btxy[dy][dx][1] += wDy*Btx[dx][1];
 
  800                  Btxy[dy][dx][2] += wy *Btx[dx][2];
 
  804         for (
int dz = 0; dz < TR_D1D; ++dz)
 
  806            const real_t wz = Bt(dz,qz);
 
  807            const real_t wDz = Gt(dz,qz);
 
  808            for (
int dy = 0; dy < TR_D1D; ++dy)
 
  810               for (
int dx = 0; dx < TR_D1D; ++dx)
 
  812                  y(dx,dy,dz,e) += wz *Btxy[dy][dx][0] +
 
  813                                   wz *Btxy[dy][dx][1] +
 
  823template<const 
int T_TR_D1D = 0, const 
int T_TE_D1D = 0, const 
int T_Q1D = 0>
 
  824static void SmemPAGradientApply3D(
const int NE,
 
  825                                  const Array<real_t> &b_,
 
  826                                  const Array<real_t> &g_,
 
  827                                  const Array<real_t> &bt_,
 
  831                                  const int tr_d1d = 0,
 
  832                                  const int te_d1d = 0,
 
  835   const int TR_D1D = T_TR_D1D ? T_TR_D1D : tr_d1d;
 
  836   const int TE_D1D = T_TE_D1D ? T_TE_D1D : te_d1d;
 
  837   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  841   MFEM_VERIFY(TR_D1D <= Q1D, 
"");
 
  842   MFEM_VERIFY(TE_D1D <= Q1D, 
"");
 
  845   auto b = 
Reshape(b_.Read(), Q1D, TR_D1D);
 
  846   auto g = 
Reshape(g_.Read(), Q1D, TR_D1D);
 
  847   auto bt = 
Reshape(bt_.Read(), TE_D1D, Q1D);
 
  848   auto D = 
Reshape(d_.Read(), Q1D*Q1D*Q1D, 3, 3, NE);
 
  849   auto x = 
Reshape(x_.Read(), TR_D1D, TR_D1D, TR_D1D, NE);
 
  850   auto y = 
Reshape(y_.ReadWrite(), TE_D1D, TE_D1D, TE_D1D, 3, NE);
 
  853                   [=] MFEM_HOST_DEVICE (int e)
 
  855      const int tidz = MFEM_THREAD_ID(z);
 
  856      const int D1DR = T_TR_D1D ? T_TR_D1D : tr_d1d;
 
  857      const int D1DE = T_TE_D1D ? T_TE_D1D : te_d1d;
 
  858      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  859      constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  860      constexpr int MD1R = T_TR_D1D ? T_TR_D1D : DofQuadLimits::MAX_D1D;
 
  861      constexpr int MD1E = T_TE_D1D ? T_TE_D1D : DofQuadLimits::MAX_D1D;
 
  862      constexpr int MD1 = MD1E > MD1R ? MD1E : MD1R;
 
  863      constexpr int MDQ = MQ1 > MD1 ? MQ1 : MD1;
 
  864      MFEM_SHARED 
real_t sBG[2][MQ1*MD1];
 
  868      MFEM_SHARED 
real_t sm0[3][MDQ*MDQ*MDQ];
 
  869      MFEM_SHARED 
real_t sm1[3][MDQ*MDQ*MDQ];
 
  871      real_t (*DDQ0)[MD1][MQ1] = (
real_t (*)[MD1][MQ1]) (sm0+0);
 
  872      real_t (*DDQ1)[MD1][MQ1] = (
real_t (*)[MD1][MQ1]) (sm0+1);
 
  874      real_t (*DQQ0)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm1+0);
 
  875      real_t (*DQQ1)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm1+1);
 
  876      real_t (*DQQ2)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm1+2);
 
  878      real_t (*QQQ0)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm0+0);
 
  879      real_t (*QQQ1)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm0+1);
 
  880      real_t (*QQQ2)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm0+2);
 
  882      real_t (*QQD0)[MQ1][MD1] = (
real_t (*)[MQ1][MD1]) (sm1+0);
 
  883      real_t (*QQD1)[MQ1][MD1] = (
real_t (*)[MQ1][MD1]) (sm1+1);
 
  884      real_t (*QQD2)[MQ1][MD1] = (
real_t (*)[MQ1][MD1]) (sm1+2);
 
  886      real_t (*QDD0)[MD1][MD1] = (
real_t (*)[MD1][MD1]) (sm0+0);
 
  887      real_t (*QDD1)[MD1][MD1] = (
real_t (*)[MD1][MD1]) (sm0+1);
 
  888      real_t (*QDD2)[MD1][MD1] = (
real_t (*)[MD1][MD1]) (sm0+2);
 
  889      MFEM_FOREACH_THREAD(dz,z,D1DR)
 
  891         MFEM_FOREACH_THREAD(dy,y,D1DR)
 
  893            MFEM_FOREACH_THREAD(dx,x,D1DR)
 
  895               X[dz][dy][dx] = x(dx,dy,dz,e);
 
  901         MFEM_FOREACH_THREAD(d,y,D1DR)
 
  903            MFEM_FOREACH_THREAD(q,x,Q1D)
 
  911      MFEM_FOREACH_THREAD(dz,z,D1DR)
 
  913         MFEM_FOREACH_THREAD(dy,y,D1DR)
 
  915            MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  919               for (
int dx = 0; dx < D1DR; ++dx)
 
  921                  const real_t coord = X[dz][dy][dx];
 
  922                  u += coord * B[qx][dx];
 
  923                  v += coord * G[qx][dx];
 
  925               DDQ0[dz][dy][qx] = 
u;
 
  926               DDQ1[dz][dy][qx] = v;
 
  931      MFEM_FOREACH_THREAD(dz,z,D1DR)
 
  933         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  935            MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  940               for (
int dy = 0; dy < D1DR; ++dy)
 
  942                  u += DDQ1[dz][dy][qx] * B[qy][dy];
 
  943                  v += DDQ0[dz][dy][qx] * G[qy][dy];
 
  944                  w += DDQ0[dz][dy][qx] * B[qy][dy];
 
  946               DQQ0[dz][qy][qx] = 
u;
 
  947               DQQ1[dz][qy][qx] = v;
 
  948               DQQ2[dz][qy][qx] = w;
 
  953      MFEM_FOREACH_THREAD(qz,z,Q1D)
 
  955         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  957            MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  962               for (
int dz = 0; dz < D1DR; ++dz)
 
  964                  u += DQQ0[dz][qy][qx] * B[qz][dz];
 
  965                  v += DQQ1[dz][qy][qx] * B[qz][dz];
 
  966                  w += DQQ2[dz][qy][qx] * G[qz][dz];
 
  968               QQQ0[qz][qy][qx] = 
u;
 
  969               QQQ1[qz][qy][qx] = v;
 
  970               QQQ2[qz][qy][qx] = w;
 
  975      MFEM_FOREACH_THREAD(qz,z,Q1D)
 
  977         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  979            MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  981               const int q = qx + (qy + qz * Q1D) * Q1D;
 
  982               const real_t gX = QQQ0[qz][qy][qx];
 
  983               const real_t gY = QQQ1[qz][qy][qx];
 
  984               const real_t gZ = QQQ2[qz][qy][qx];
 
  985               QQQ0[qz][qy][qx] = (D(q,0,0,e)*gX) + (D(q,1,0,e)*gY) + (D(q,2,0,e)*gZ);
 
  986               QQQ1[qz][qy][qx] = (D(q,0,1,e)*gX) + (D(q,1,1,e)*gY) + (D(q,2,1,e)*gZ);
 
  987               QQQ2[qz][qy][qx] = (D(q,0,2,e)*gX) + (D(q,1,2,e)*gY) + (D(q,2,2,e)*gZ);
 
  994         MFEM_FOREACH_THREAD(d,y,D1DE)
 
  996            MFEM_FOREACH_THREAD(q,x,Q1D)
 
 1003      MFEM_FOREACH_THREAD(qz,z,Q1D)
 
 1005         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 1007            MFEM_FOREACH_THREAD(dx,x,D1DE)
 
 1012               for (
int qx = 0; qx < Q1D; ++qx)
 
 1014                  u += QQQ0[qz][qy][qx] * Bt[dx][qx];
 
 1015                  v += QQQ1[qz][qy][qx] * Bt[dx][qx];
 
 1016                  w += QQQ2[qz][qy][qx] * Bt[dx][qx];
 
 1018               QQD0[qz][qy][dx] = 
u;
 
 1019               QQD1[qz][qy][dx] = v;
 
 1020               QQD2[qz][qy][dx] = w;
 
 1025      MFEM_FOREACH_THREAD(qz,z,Q1D)
 
 1027         MFEM_FOREACH_THREAD(dy,y,D1DE)
 
 1029            MFEM_FOREACH_THREAD(dx,x,D1DE)
 
 1034               for (
int qy = 0; qy < Q1D; ++qy)
 
 1036                  u += QQD0[qz][qy][dx] * Bt[dy][qy];
 
 1037                  v += QQD1[qz][qy][dx] * Bt[dy][qy];
 
 1038                  w += QQD2[qz][qy][dx] * Bt[dy][qy];
 
 1040               QDD0[qz][dy][dx] = 
u;
 
 1041               QDD1[qz][dy][dx] = v;
 
 1042               QDD2[qz][dy][dx] = w;
 
 1047      MFEM_FOREACH_THREAD(dz,z,D1DE)
 
 1049         MFEM_FOREACH_THREAD(dy,y,D1DE)
 
 1051            MFEM_FOREACH_THREAD(dx,x,D1DE)
 
 1056               for (
int qz = 0; qz < Q1D; ++qz)
 
 1058                  u += QDD0[qz][dy][dx] * Bt[dz][qz];
 
 1059                  v += QDD1[qz][dy][dx] * Bt[dz][qz];
 
 1060                  w += QDD2[qz][dy][dx] * Bt[dz][qz];
 
 1062               y(dx,dy,dz,0,e) += 
u;
 
 1063               y(dx,dy,dz,1,e) += v;
 
 1064               y(dx,dy,dz,2,e) += w;
 
 1071static void PAGradientApply(
const int dim,
 
 1076                            const Array<real_t> &B,
 
 1077                            const Array<real_t> &G,
 
 1078                            const Array<real_t> &Bt,
 
 1085      return PAGradientApply2D(NE,B,G,Bt,op,x,y,TR_D1D,TE_D1D,Q1D);
 
 1089      return PAGradientApply3D(NE,B,G,Bt,op,x,y,TR_D1D,TE_D1D,Q1D);
 
 1091   MFEM_ABORT(
"Unknown kernel.");
 
 1094static void PAGradientApplyTranspose(
const int dim,
 
 1099                                     const Array<real_t> &Bt,
 
 1100                                     const Array<real_t> &Gt,
 
 1101                                     const Array<real_t> &B,
 
 1108      return PAGradientApplyTranspose2D(NE,Bt,Gt,B,op,x,y,TR_D1D,TE_D1D,Q1D);
 
 1112      return PAGradientApplyTranspose3D(NE,Bt,Gt,B,op,x,y,TR_D1D,TE_D1D,Q1D);
 
 1114   MFEM_ABORT(
"Unknown kernel.");
 
 1120   PAGradientApply(
dim, trial_dofs1D, test_dofs1D, quad1D, ne,
 
 1121                   trial_maps->
B, trial_maps->
G, test_maps->
Bt, pa_data,
 
 
 1128   PAGradientApplyTranspose(
dim, trial_dofs1D, test_dofs1D, quad1D, ne,
 
 1129                            trial_maps->
Bt, trial_maps->
Gt, test_maps->
B, pa_data,
 
 
const T * Read(bool on_dev=true) const
Shortcut for mfem::Read(a.GetMemory(), a.Size(), on_dev).
Class to represent a coefficient evaluated at quadrature points.
static MemoryType GetMemoryType()
(DEPRECATED) Equivalent to GetDeviceMemoryType().
Array< real_t > G
Gradients/divergences/curls of basis functions evaluated at quadrature points.
@ TENSOR
Tensor product representation using 1D matrices/tensors with dimensions using 1D number of quadrature...
Array< real_t > B
Basis functions evaluated at quadrature points.
int ndof
Number of degrees of freedom = number of basis functions. When mode is TENSOR, this is the 1D number.
Array< real_t > Gt
Transpose of G.
int nqpt
Number of quadrature points. When mode is TENSOR, this is the 1D number.
Array< real_t > Bt
Transpose of B.
Class FiniteElementSpace - responsible for providing FEM view of the mesh, mainly managing the set of...
Ordering::Type GetOrdering() const
Return the ordering method.
int GetNE() const
Returns number of elements in the mesh.
Mesh * GetMesh() const
Returns the mesh.
const FiniteElement * GetTypicalFE() const
Return GetFE(0) if the local mesh is not empty; otherwise return a typical FE based on the Geometry t...
Abstract class for all finite elements.
virtual const DofToQuad & GetDofToQuad(const IntegrationRule &ir, DofToQuad::Mode mode) const
Return a DofToQuad structure corresponding to the given IntegrationRule using the given DofToQuad::Mo...
int GetDim() const
Returns the reference space dimension for the finite element.
int GetMapType() const
Returns the FiniteElement::MapType of the element describing how reference functions are mapped to ph...
Vector detJ
Determinants of the Jacobians at all quadrature points.
Vector J
Jacobians of the element transformations at all quadrature points.
static const IntegrationRule & GetRule(const FiniteElement &trial_fe, const FiniteElement &test_fe, const ElementTransformation &Trans)
void AddMultTransposePA(const Vector &x, Vector &y) const override
Method for partially assembled transposed action.
void AddMultPA(const Vector &x, Vector &y) const override
Method for partially assembled action.
void AssemblePA(const FiniteElementSpace &trial_fes, const FiniteElementSpace &test_fes) override
Class for an integration rule - an Array of IntegrationPoint.
int GetNPoints() const
Returns the number of the points in the integration rule.
const Array< real_t > & GetWeights() const
Return the quadrature weights in a contiguous array.
const IntegrationRule * IntRule
int Dimension() const
Dimension of the reference space used within the elements.
ElementTransformation * GetTypicalElementTransformation()
If the local mesh is not empty return GetElementTransformation(0); otherwise, return the identity tra...
const GeometricFactors * GetGeometricFactors(const IntegrationRule &ir, const int flags, MemoryType d_mt=MemoryType::DEFAULT)
Return the mesh geometric factors corresponding to the given integration rule.
Class representing the storage layout of a QuadratureFunction.
virtual const real_t * Read(bool on_dev=true) const
Shortcut for mfem::Read(vec.GetMemory(), vec.Size(), on_dev).
virtual real_t * ReadWrite(bool on_dev=true)
Shortcut for mfem::ReadWrite(vec.GetMemory(), vec.Size(), on_dev).
void SetSize(int s)
Resize the vector to size s.
void trans(const Vector &u, Vector &x)
const T * Read(const Memory< T > &mem, int size, bool on_dev=true)
Get a pointer for read access to mem with the mfem::Device's DeviceMemoryClass, if on_dev = true,...
real_t u(const Vector &xvec)
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
@ COMPRESSED
Enable all above compressions.
void forall_3D(int N, int X, int Y, int Z, lambda &&body)
void forall(int N, lambda &&body)
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.