20static void PADivergenceSetup2D(
const int Q1D,
 
   22                                const Array<real_t> &w,
 
   27   const int NQ = Q1D*Q1D;
 
   29   auto J = 
Reshape(j.Read(), NQ, 2, 2, NE);
 
   30   auto y = 
Reshape(op.Write(), NQ, 2, 2, NE);
 
   34      for (
int q = 0; q < NQ; ++q)
 
   36         const real_t J11 = J(q,0,0,e);
 
   37         const real_t J12 = J(q,0,1,e);
 
   38         const real_t J21 = J(q,1,0,e);
 
   39         const real_t J22 = J(q,1,1,e);
 
   41         y(q,0,0,e) = W[q] * COEFF *  J22; 
 
   42         y(q,0,1,e) = W[q] * COEFF * -J12; 
 
   43         y(q,1,0,e) = W[q] * COEFF * -J21; 
 
   44         y(q,1,1,e) = W[q] * COEFF *  J11; 
 
   50static void PADivergenceSetup3D(
const int Q1D,
 
   52                                const Array<real_t> &w,
 
   57   const int NQ = Q1D*Q1D*Q1D;
 
   59   auto J = 
Reshape(j.Read(), NQ, 3, 3, NE);
 
   60   auto y = 
Reshape(op.Write(), NQ, 3, 3, NE);
 
   63      for (
int q = 0; q < NQ; ++q)
 
   65         const real_t J11 = J(q,0,0,e);
 
   66         const real_t J21 = J(q,1,0,e);
 
   67         const real_t J31 = J(q,2,0,e);
 
   68         const real_t J12 = J(q,0,1,e);
 
   69         const real_t J22 = J(q,1,1,e);
 
   70         const real_t J32 = J(q,2,1,e);
 
   71         const real_t J13 = J(q,0,2,e);
 
   72         const real_t J23 = J(q,1,2,e);
 
   73         const real_t J33 = J(q,2,2,e);
 
   74         const real_t cw  = W[q] * COEFF;
 
   76         const real_t A11 = (J22 * J33) - (J23 * J32);
 
   77         const real_t A12 = (J32 * J13) - (J12 * J33);
 
   78         const real_t A13 = (J12 * J23) - (J22 * J13);
 
   79         const real_t A21 = (J31 * J23) - (J21 * J33);
 
   80         const real_t A22 = (J11 * J33) - (J13 * J31);
 
   81         const real_t A23 = (J21 * J13) - (J11 * J23);
 
   82         const real_t A31 = (J21 * J32) - (J31 * J22);
 
   83         const real_t A32 = (J31 * J12) - (J11 * J32);
 
   84         const real_t A33 = (J11 * J22) - (J12 * J21);
 
   86         y(q,0,0,e) = cw * A11; 
 
   87         y(q,0,1,e) = cw * A12; 
 
   88         y(q,0,2,e) = cw * A13; 
 
   89         y(q,1,0,e) = cw * A21; 
 
   90         y(q,1,1,e) = cw * A22; 
 
   91         y(q,1,2,e) = cw * A23; 
 
   92         y(q,2,0,e) = cw * A31; 
 
   93         y(q,2,1,e) = cw * A32; 
 
   94         y(q,2,2,e) = cw * A33; 
 
   99static void PADivergenceSetup(
const int dim,
 
  104                              const Array<real_t> &W,
 
  109   if (
dim == 1) { MFEM_ABORT(
"dim==1 not supported in PADivergenceSetup"); }
 
  112      PADivergenceSetup2D(Q1D, NE, W, J, COEFF, op);
 
  116      PADivergenceSetup3D(Q1D, NE, W, J, COEFF, op);
 
  125               "PA Only supports Ordering::byNODES!");
 
  132   const int dims = trial_fe.
GetDim();
 
  133   const int dimsToStore = dims * dims;
 
  136   ne = trial_fes.
GetNE();
 
  139   trial_dofs1D = trial_maps->
ndof;
 
  140   quad1D = trial_maps->
nqpt;
 
  142   test_dofs1D = test_maps->
ndof;
 
  143   MFEM_ASSERT(quad1D == test_maps->
nqpt,
 
  144               "PA requires test and trial space to have same number of quadrature points!");
 
  150      MFEM_VERIFY(cQ != NULL, 
"only ConstantCoefficient is supported!");
 
  153   PADivergenceSetup(
dim, trial_dofs1D, test_dofs1D, quad1D,
 
 
  158template<const 
int T_TR_D1D = 0, const 
int T_TE_D1D = 0, const 
int T_Q1D = 0>
 
  159static void PADivergenceApply2D(
const int NE,
 
  166                                const int tr_d1d = 0,
 
  167                                const int te_d1d = 0,
 
  170   const int TR_D1D = T_TR_D1D ? T_TR_D1D : tr_d1d;
 
  171   const int TE_D1D = T_TE_D1D ? T_TE_D1D : te_d1d;
 
  172   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  176   auto B = 
Reshape(
b.Read(), Q1D, TR_D1D);
 
  180   auto x = 
Reshape(x_.
Read(), TR_D1D, TR_D1D, 2, NE);
 
  184      const int TR_D1D = T_TR_D1D ? T_TR_D1D : tr_d1d;
 
  185      const int TE_D1D = T_TE_D1D ? T_TE_D1D : te_d1d;
 
  186      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  189      constexpr int max_TE_D1D = T_TE_D1D ? T_TE_D1D : DofQuadLimits::MAX_D1D;
 
  190      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  192      real_t grad[max_Q1D][max_Q1D][VDIM];
 
  193      real_t div[max_Q1D][max_Q1D];
 
  194      for (
int qy = 0; qy < Q1D; ++qy)
 
  196         for (
int qx = 0; qx < Q1D; ++qx)
 
  202      for (
int c = 0; c < VDIM; ++c)
 
  204         for (
int qy = 0; qy < Q1D; ++qy)
 
  206            for (
int qx = 0; qx < Q1D; ++qx)
 
  208               grad[qy][qx][0] = 0.0;
 
  209               grad[qy][qx][1] = 0.0;
 
  212         for (
int dy = 0; dy < TR_D1D; ++dy)
 
  214            real_t gradX[max_Q1D][VDIM];
 
  215            for (
int qx = 0; qx < Q1D; ++qx)
 
  220            for (
int dx = 0; dx < TR_D1D; ++dx)
 
  222               const real_t s = x(dx,dy,c,e);
 
  223               for (
int qx = 0; qx < Q1D; ++qx)
 
  225                  gradX[qx][0] += s * G(qx,dx);
 
  226                  gradX[qx][1] += s * B(qx,dx);
 
  229            for (
int qy = 0; qy < Q1D; ++qy)
 
  231               const real_t wy  = B(qy,dy);
 
  232               const real_t wDy = G(qy,dy);
 
  233               for (
int qx = 0; qx < Q1D; ++qx)
 
  235                  grad[qy][qx][0] += gradX[qx][0] * wy;
 
  236                  grad[qy][qx][1] += gradX[qx][1] * wDy;
 
  241         for (
int qy = 0; qy < Q1D; ++qy)
 
  243            for (
int qx = 0; qx < Q1D; ++qx)
 
  245               const int q = qx + qy * Q1D;
 
  246               const real_t gradX = grad[qy][qx][0];
 
  247               const real_t gradY = grad[qy][qx][1];
 
  249               div[qy][qx] += gradX*op(q,0,c,e) + gradY*op(q,1,c,e);
 
  254      for (
int qy = 0; qy < Q1D; ++qy)
 
  257         for (
int dx = 0; dx < TE_D1D; ++dx)
 
  260            for (
int qx = 0; qx < Q1D; ++qx)
 
  262               opX[dx] += Bt(dx,qx)*div[qy][qx];
 
  265         for (
int dy = 0; dy < TE_D1D; ++dy)
 
  267            for (
int dx = 0; dx < TE_D1D; ++dx)
 
  269               y(dx,dy,e) += Bt(dy,qy)*opX[dx];
 
  278template<
const int T_TR_D1D = 0, 
const int T_TE_D1D = 0, 
const int T_Q1D = 0,
 
  280static void SmemPADivergenceApply2D(
const int NE,
 
  281                                    const Array<real_t> &b_,
 
  282                                    const Array<real_t> &g_,
 
  283                                    const Array<real_t> &bt_,
 
  287                                    const int tr_d1d = 0,
 
  288                                    const int te_d1d = 0,
 
  292   MFEM_ASSERT(
false, 
"SHARED MEM NOT PROGRAMMED YET");
 
  296template<const 
int T_TR_D1D = 0, const 
int T_TE_D1D = 0, const 
int T_Q1D = 0>
 
  297static void PADivergenceApplyTranspose2D(
const int NE,
 
  298                                         const Array<real_t> &bt,
 
  299                                         const Array<real_t> >,
 
  300                                         const Array<real_t> &
b,
 
  304                                         const int tr_d1d = 0,
 
  305                                         const int te_d1d = 0,
 
  308   const int TR_D1D = T_TR_D1D ? T_TR_D1D : tr_d1d;
 
  309   const int TE_D1D = T_TE_D1D ? T_TE_D1D : te_d1d;
 
  310   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  314   auto Bt = 
Reshape(bt.Read(), TR_D1D, Q1D);
 
  315   auto Gt = 
Reshape(gt.Read(), TR_D1D, Q1D);
 
  316   auto B  = 
Reshape(
b.Read(), Q1D, TE_D1D);
 
  317   auto op = 
Reshape(op_.Read(), Q1D*Q1D, 2,2, NE);
 
  318   auto x  = 
Reshape(x_.Read(), TE_D1D, TE_D1D, NE);
 
  319   auto y  = 
Reshape(y_.ReadWrite(), TR_D1D, TR_D1D, 2, NE);
 
  322      const int TR_D1D = T_TR_D1D ? T_TR_D1D : tr_d1d;
 
  323      const int TE_D1D = T_TE_D1D ? T_TE_D1D : te_d1d;
 
  324      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  327      constexpr int max_TR_D1D = T_TR_D1D ? T_TR_D1D : DofQuadLimits::MAX_D1D;
 
  328      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  330      real_t quadTest[max_Q1D][max_Q1D];
 
  331      real_t grad[max_Q1D][max_Q1D][VDIM];
 
  332      for (
int qy = 0; qy < Q1D; ++qy)
 
  334         for (
int qx = 0; qx < Q1D; ++qx)
 
  336            quadTest[qy][qx] = 0.0;
 
  339      for (
int dy = 0; dy < TE_D1D; ++dy)
 
  341         real_t quadTestX[max_Q1D];
 
  342         for (
int qx = 0; qx < Q1D; ++qx)
 
  346         for (
int dx = 0; dx < TE_D1D; ++dx)
 
  348            const real_t s = x(dx,dy,e);
 
  349            for (
int qx = 0; qx < Q1D; ++qx)
 
  351               quadTestX[qx] += s * B(qx,dx);
 
  354         for (
int qy = 0; qy < Q1D; ++qy)
 
  356            const real_t wy = B(qy,dy);
 
  357            for (
int qx = 0; qx < Q1D; ++qx)
 
  359               quadTest[qy][qx] += quadTestX[qx] * wy;
 
  364      for (
int c = 0; c < VDIM; ++c)
 
  366         for (
int qy = 0; qy < Q1D; ++qy)
 
  368            for (
int qx = 0; qx < Q1D; ++qx)
 
  370               const int q = qx + qy * Q1D;
 
  371               grad[qy][qx][0] = quadTest[qy][qx]*op(q,0,c,e);
 
  372               grad[qy][qx][1] = quadTest[qy][qx]*op(q,1,c,e);
 
  376         for (
int qy = 0; qy < Q1D; ++qy)
 
  378            real_t gradX[max_TR_D1D][VDIM];
 
  379            for (
int dx = 0; dx < TR_D1D; ++dx)
 
  384            for (
int qx = 0; qx < Q1D; ++qx)
 
  386               const real_t gX = grad[qy][qx][0];
 
  387               const real_t gY = grad[qy][qx][1];
 
  388               for (
int dx = 0; dx < TR_D1D; ++dx)
 
  390                  const real_t wx  = Bt(dx,qx);
 
  391                  const real_t wDx = Gt(dx,qx);
 
  392                  gradX[dx][0] += gX * wDx;
 
  393                  gradX[dx][1] += gY * wx;
 
  396            for (
int dy = 0; dy < TR_D1D; ++dy)
 
  398               const real_t wy  = Bt(dy,qy);
 
  399               const real_t wDy = Gt(dy,qy);
 
  400               for (
int dx = 0; dx < TR_D1D; ++dx)
 
  402                  y(dx,dy,c,e) += ((gradX[dx][0] * wy) + (gradX[dx][1] * wDy));
 
  412template<const 
int T_TR_D1D = 0, const 
int T_TE_D1D = 0, const 
int T_Q1D = 0>
 
  413static void PADivergenceApply3D(
const int NE,
 
  414                                const Array<real_t> &
b,
 
  415                                const Array<real_t> &g,
 
  416                                const Array<real_t> &bt,
 
  424   const int TR_D1D = T_TR_D1D ? T_TR_D1D : tr_d1d;
 
  425   const int TE_D1D = T_TE_D1D ? T_TE_D1D : te_d1d;
 
  426   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  430   auto B = 
Reshape(
b.Read(), Q1D, TR_D1D);
 
  431   auto G = 
Reshape(g.Read(), Q1D, TR_D1D);
 
  432   auto Bt = 
Reshape(bt.Read(), TE_D1D, Q1D);
 
  433   auto op = 
Reshape(op_.Read(), Q1D*Q1D*Q1D, 3,3, NE);
 
  434   auto x = 
Reshape(x_.Read(), TR_D1D, TR_D1D, TR_D1D, 3, NE);
 
  435   auto y = 
Reshape(y_.ReadWrite(), TE_D1D, TE_D1D, TE_D1D, NE);
 
  438      const int TR_D1D = T_TR_D1D ? T_TR_D1D : tr_d1d;
 
  439      const int TE_D1D = T_TE_D1D ? T_TE_D1D : te_d1d;
 
  440      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  443      constexpr int max_TE_D1D = T_TE_D1D ? T_TE_D1D : DofQuadLimits::MAX_D1D;
 
  444      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  446      real_t grad[max_Q1D][max_Q1D][max_Q1D][VDIM];
 
  447      real_t div[max_Q1D][max_Q1D][max_Q1D];
 
  448      for (
int qz = 0; qz < Q1D; ++qz)
 
  450         for (
int qy = 0; qy < Q1D; ++qy)
 
  452            for (
int qx = 0; qx < Q1D; ++qx)
 
  454               div[qz][qy][qx] = 0.0;
 
  459      for (
int c = 0; c < VDIM; ++c)
 
  461         for (
int qz = 0; qz < Q1D; ++qz)
 
  463            for (
int qy = 0; qy < Q1D; ++qy)
 
  465               for (
int qx = 0; qx < Q1D; ++qx)
 
  467                  grad[qz][qy][qx][0] = 0.0;
 
  468                  grad[qz][qy][qx][1] = 0.0;
 
  469                  grad[qz][qy][qx][2] = 0.0;
 
  473         for (
int dz = 0; dz < TR_D1D; ++dz)
 
  475            real_t gradXY[max_Q1D][max_Q1D][VDIM];
 
  476            for (
int qy = 0; qy < Q1D; ++qy)
 
  478               for (
int qx = 0; qx < Q1D; ++qx)
 
  480                  gradXY[qy][qx][0] = 0.0;
 
  481                  gradXY[qy][qx][1] = 0.0;
 
  482                  gradXY[qy][qx][2] = 0.0;
 
  485            for (
int dy = 0; dy < TR_D1D; ++dy)
 
  487               real_t gradX[max_Q1D][VDIM];
 
  488               for (
int qx = 0; qx < Q1D; ++qx)
 
  494               for (
int dx = 0; dx < TR_D1D; ++dx)
 
  496                  const real_t s = x(dx,dy,dz,c,e);
 
  497                  for (
int qx = 0; qx < Q1D; ++qx)
 
  499                     gradX[qx][0] += s * G(qx,dx);
 
  500                     gradX[qx][1] += s * B(qx,dx);
 
  501                     gradX[qx][2] += s * B(qx,dx);
 
  504               for (
int qy = 0; qy < Q1D; ++qy)
 
  506                  const real_t wy  = B(qy,dy);
 
  507                  const real_t wDy = G(qy,dy);
 
  508                  for (
int qx = 0; qx < Q1D; ++qx)
 
  510                     gradXY[qy][qx][0] += gradX[qx][0] * wy;
 
  511                     gradXY[qy][qx][1] += gradX[qx][1] * wDy;
 
  512                     gradXY[qy][qx][2] += gradX[qx][2] * wy;
 
  516            for (
int qz = 0; qz < Q1D; ++qz)
 
  518               const real_t wz  = B(qz,dz);
 
  519               const real_t wDz = G(qz,dz);
 
  520               for (
int qy = 0; qy < Q1D; ++qy)
 
  522                  for (
int qx = 0; qx < Q1D; ++qx)
 
  524                     grad[qz][qy][qx][0] += gradXY[qy][qx][0] * wz;
 
  525                     grad[qz][qy][qx][1] += gradXY[qy][qx][1] * wz;
 
  526                     grad[qz][qy][qx][2] += gradXY[qy][qx][2] * wDz;
 
  532         for (
int qz = 0; qz < Q1D; ++qz)
 
  534            for (
int qy = 0; qy < Q1D; ++qy)
 
  536               for (
int qx = 0; qx < Q1D; ++qx)
 
  538                  const int q = qx + (qy + qz * Q1D) * Q1D;
 
  539                  const real_t gradX = grad[qz][qy][qx][0];
 
  540                  const real_t gradY = grad[qz][qy][qx][1];
 
  541                  const real_t gradZ = grad[qz][qy][qx][2];
 
  543                  div[qz][qy][qx] += gradX*op(q,0,c,e) + gradY*op(q,1,c,e) + gradZ*op(q,2,c,e);
 
  550      for (
int qz = 0; qz < Q1D; ++qz)
 
  552         real_t opXY[max_TE_D1D][max_TE_D1D];
 
  553         for (
int dy = 0; dy < TE_D1D; ++dy)
 
  555            for (
int dx = 0; dx < TE_D1D; ++dx)
 
  560         for (
int qy = 0; qy < Q1D; ++qy)
 
  563            for (
int dx = 0; dx < TE_D1D; ++dx)
 
  566               for (
int qx = 0; qx < Q1D; ++qx)
 
  568                  opX[dx] += Bt(dx,qx)*div[qz][qy][qx];
 
  571            for (
int dy = 0; dy < TE_D1D; ++dy)
 
  573               for (
int dx = 0; dx < TE_D1D; ++dx)
 
  575                  opXY[dy][dx] += Bt(dy,qy)*opX[dx];
 
  579         for (
int dz = 0; dz < TE_D1D; ++dz)
 
  581            for (
int dy = 0; dy < TE_D1D; ++dy)
 
  583               for (
int dx = 0; dx < TE_D1D; ++dx)
 
  585                  y(dx,dy,dz,e) += Bt(dz,qz)*opXY[dy][dx];
 
  595template<const 
int T_TR_D1D = 0, const 
int T_TE_D1D = 0, const 
int T_Q1D = 0>
 
  596static void PADivergenceApplyTranspose3D(
const int NE,
 
  597                                         const Array<real_t> &bt,
 
  598                                         const Array<real_t> >,
 
  599                                         const Array<real_t> &
b,
 
  607   const int TR_D1D = T_TR_D1D ? T_TR_D1D : tr_d1d;
 
  608   const int TE_D1D = T_TE_D1D ? T_TE_D1D : te_d1d;
 
  609   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  613   auto Bt = 
Reshape(bt.Read(), TR_D1D, Q1D);
 
  614   auto Gt = 
Reshape(gt.Read(), TR_D1D, Q1D);
 
  615   auto B  = 
Reshape(
b.Read(), Q1D, TE_D1D);
 
  616   auto op = 
Reshape(op_.Read(), Q1D*Q1D*Q1D, 3,3, NE);
 
  617   auto x  = 
Reshape(x_.Read(), TE_D1D, TE_D1D, TE_D1D, NE);
 
  618   auto y  = 
Reshape(y_.ReadWrite(), TR_D1D, TR_D1D, TR_D1D, 3, NE);
 
  621      const int TR_D1D = T_TR_D1D ? T_TR_D1D : tr_d1d;
 
  622      const int TE_D1D = T_TE_D1D ? T_TE_D1D : te_d1d;
 
  623      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  626      constexpr int max_TR_D1D = T_TR_D1D ? T_TR_D1D : DofQuadLimits::MAX_D1D;
 
  627      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  629      real_t quadTest[max_Q1D][max_Q1D][max_Q1D];
 
  630      real_t grad[max_Q1D][max_Q1D][max_Q1D][VDIM];
 
  631      for (
int qz = 0; qz < Q1D; ++qz)
 
  633         for (
int qy = 0; qy < Q1D; ++qy)
 
  635            for (
int qx = 0; qx < Q1D; ++qx)
 
  637               quadTest[qz][qy][qx] = 0.0;
 
  641      for (
int dz = 0; dz < TE_D1D; ++dz)
 
  643         real_t quadTestXY[max_Q1D][max_Q1D];
 
  644         for (
int qy = 0; qy < Q1D; ++qy)
 
  646            for (
int qx = 0; qx < Q1D; ++qx)
 
  648               quadTestXY[qy][qx] = 0.0;
 
  651         for (
int dy = 0; dy < TE_D1D; ++dy)
 
  653            real_t quadTestX[max_Q1D];
 
  654            for (
int qx = 0; qx < Q1D; ++qx)
 
  658            for (
int dx = 0; dx < TE_D1D; ++dx)
 
  660               const real_t s = x(dx,dy,dz,e);
 
  661               for (
int qx = 0; qx < Q1D; ++qx)
 
  663                  quadTestX[qx] += s * B(qx,dx);
 
  666            for (
int qy = 0; qy < Q1D; ++qy)
 
  668               const real_t wy  = B(qy,dy);
 
  669               for (
int qx = 0; qx < Q1D; ++qx)
 
  671                  quadTestXY[qy][qx] += quadTestX[qx] * wy;
 
  675         for (
int qz = 0; qz < Q1D; ++qz)
 
  677            const real_t wz  = B(qz,dz);
 
  678            for (
int qy = 0; qy < Q1D; ++qy)
 
  680               for (
int qx = 0; qx < Q1D; ++qx)
 
  682                  quadTest[qz][qy][qx] += quadTestXY[qy][qx] * wz;
 
  688      for (
int c = 0; c < VDIM; ++c)
 
  690         for (
int qz = 0; qz < Q1D; ++qz)
 
  692            for (
int qy = 0; qy < Q1D; ++qy)
 
  694               for (
int qx = 0; qx < Q1D; ++qx)
 
  696                  const int q = qx + (qy + qz * Q1D) * Q1D;
 
  697                  grad[qz][qy][qx][0] = quadTest[qz][qy][qx]*op(q,0,c,e);
 
  698                  grad[qz][qy][qx][1] = quadTest[qz][qy][qx]*op(q,1,c,e);
 
  699                  grad[qz][qy][qx][2] = quadTest[qz][qy][qx]*op(q,2,c,e);
 
  704         for (
int qz = 0; qz < Q1D; ++qz)
 
  706            real_t gradXY[max_TR_D1D][max_TR_D1D][VDIM];
 
  707            for (
int dy = 0; dy < TR_D1D; ++dy)
 
  709               for (
int dx = 0; dx < TR_D1D; ++dx)
 
  711                  gradXY[dy][dx][0] = 0.0;
 
  712                  gradXY[dy][dx][1] = 0.0;
 
  713                  gradXY[dy][dx][2] = 0.0;
 
  716            for (
int qy = 0; qy < Q1D; ++qy)
 
  718               real_t gradX[max_TR_D1D][VDIM];
 
  719               for (
int dx = 0; dx < TR_D1D; ++dx)
 
  725               for (
int qx = 0; qx < Q1D; ++qx)
 
  727                  const real_t gX = grad[qz][qy][qx][0];
 
  728                  const real_t gY = grad[qz][qy][qx][1];
 
  729                  const real_t gZ = grad[qz][qy][qx][2];
 
  730                  for (
int dx = 0; dx < TR_D1D; ++dx)
 
  732                     const real_t wx  = Bt(dx,qx);
 
  733                     const real_t wDx = Gt(dx,qx);
 
  734                     gradX[dx][0] += gX * wDx;
 
  735                     gradX[dx][1] += gY * wx;
 
  736                     gradX[dx][2] += gZ * wx;
 
  739               for (
int dy = 0; dy < TR_D1D; ++dy)
 
  741                  const real_t wy  = Bt(dy,qy);
 
  742                  const real_t wDy = Gt(dy,qy);
 
  743                  for (
int dx = 0; dx < TR_D1D; ++dx)
 
  745                     gradXY[dy][dx][0] += gradX[dx][0] * wy;
 
  746                     gradXY[dy][dx][1] += gradX[dx][1] * wDy;
 
  747                     gradXY[dy][dx][2] += gradX[dx][2] * wy;
 
  751            for (
int dz = 0; dz < TR_D1D; ++dz)
 
  753               const real_t wz  = Bt(dz,qz);
 
  754               const real_t wDz = Gt(dz,qz);
 
  755               for (
int dy = 0; dy < TR_D1D; ++dy)
 
  757                  for (
int dx = 0; dx < TR_D1D; ++dx)
 
  760                        ((gradXY[dy][dx][0] * wz) +
 
  761                         (gradXY[dy][dx][1] * wz) +
 
  762                         (gradXY[dy][dx][2] * wDz));
 
  773template<const 
int T_TR_D1D = 0, const 
int T_TE_D1D = 0, const 
int T_Q1D = 0>
 
  774static void SmemPADivergenceApply3D(
const int NE,
 
  775                                    const Array<real_t> &b_,
 
  776                                    const Array<real_t> &g_,
 
  777                                    const Array<real_t> &bt_,
 
  781                                    const int tr_d1d = 0,
 
  782                                    const int te_d1d = 0,
 
  785   const int TR_D1D = T_TR_D1D ? T_TR_D1D : tr_d1d;
 
  786   const int TE_D1D = T_TE_D1D ? T_TE_D1D : te_d1d;
 
  787   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  793   auto b = 
Reshape(b_.Read(), Q1D, TR_D1D);
 
  794   auto g = 
Reshape(g_.Read(), Q1D, TR_D1D);
 
  795   auto bt = 
Reshape(bt_.Read(), TE_D1D, Q1D);
 
  796   auto Q = 
Reshape(q_.Read(), Q1D*Q1D*Q1D, 3,3, NE);
 
  797   auto x = 
Reshape(x_.Read(), TR_D1D, TR_D1D, TR_D1D, 3, NE);
 
  798   auto y = 
Reshape(y_.ReadWrite(), TE_D1D, TE_D1D, TE_D1D, NE);
 
  802      constexpr int VDIM = 3;
 
  803      const int tidz = MFEM_THREAD_ID(z);
 
  804      const int D1DR = T_TR_D1D ? T_TR_D1D : tr_d1d;
 
  805      const int D1DE = T_TE_D1D ? T_TE_D1D : te_d1d;
 
  806      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  807      constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  808      constexpr int MD1R = T_TR_D1D ? T_TR_D1D : DofQuadLimits::MAX_D1D;
 
  809      constexpr int MD1E = T_TE_D1D ? T_TE_D1D : DofQuadLimits::MAX_D1D;
 
  810      constexpr int MD1 = MD1E > MD1R ? MD1E : MD1R;
 
  811      constexpr int MDQ = MQ1 > MD1 ? MQ1 : MD1;
 
  812      MFEM_SHARED 
real_t sBG[2][MQ1*MD1];
 
  816      MFEM_SHARED 
real_t sm0[3][MDQ*MDQ*MDQ];
 
  817      MFEM_SHARED 
real_t sm1[3][MDQ*MDQ*MDQ];
 
  819      real_t (*DDQ0)[MD1][MQ1] = (
real_t (*)[MD1][MQ1]) (sm0+0);
 
  820      real_t (*DDQ1)[MD1][MQ1] = (
real_t (*)[MD1][MQ1]) (sm0+1);
 
  821      real_t (*DQQ0)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm1+0);
 
  822      real_t (*DQQ1)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm1+1);
 
  823      real_t (*DQQ2)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm1+2);
 
  824      real_t (*QQQ0)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm0+0);
 
  825      real_t (*QQQ1)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm0+1);
 
  826      real_t (*QQQ2)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm0+2);
 
  827      real_t (*QQD0)[MQ1][MD1] = (
real_t (*)[MQ1][MD1]) (sm1+0);
 
  828      real_t (*QDD0)[MD1][MD1] = (
real_t (*)[MD1][MD1]) (sm0+0);
 
  829      MFEM_SHARED 
real_t div[MQ1][MQ1][MQ1];
 
  833         MFEM_FOREACH_THREAD(d,y,D1DR)
 
  835            MFEM_FOREACH_THREAD(q,x,Q1D)
 
  843      MFEM_FOREACH_THREAD(qz,z,Q1D)
 
  845         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  847            MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  849               div[qz][qy][qx] = 0.0;
 
  855      for (
int c = 0; c < VDIM; ++c)
 
  857         MFEM_FOREACH_THREAD(qz,z,Q1D)
 
  859            MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  861               MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  863                  QQQ0[qz][qy][qx] = 0.0;
 
  864                  QQQ1[qz][qy][qx] = 0.0;
 
  865                  QQQ2[qz][qy][qx] = 0.0;
 
  870         MFEM_FOREACH_THREAD(dz,z,D1DR)
 
  872            MFEM_FOREACH_THREAD(dy,y,D1DR)
 
  874               MFEM_FOREACH_THREAD(dx,x,D1DR)
 
  876                  X[dz][dy][dx] = x(dx,dy,dz,c,e);
 
  881         MFEM_FOREACH_THREAD(dz,z,D1DR)
 
  883            MFEM_FOREACH_THREAD(dy,y,D1DR)
 
  885               MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  889                  for (
int dx = 0; dx < D1DR; ++dx)
 
  891                     const real_t coord = X[dz][dy][dx];
 
  892                     u += coord * B[qx][dx];
 
  893                     v += coord * G[qx][dx];
 
  895                  DDQ0[dz][dy][qx] = 
u;
 
  896                  DDQ1[dz][dy][qx] = v;
 
  901         MFEM_FOREACH_THREAD(dz,z,D1DR)
 
  903            MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  905               MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  910                  for (
int dy = 0; dy < D1DR; ++dy)
 
  912                     u += DDQ1[dz][dy][qx] * B[qy][dy];
 
  913                     v += DDQ0[dz][dy][qx] * G[qy][dy];
 
  914                     w += DDQ0[dz][dy][qx] * B[qy][dy];
 
  916                  DQQ0[dz][qy][qx] = 
u;
 
  917                  DQQ1[dz][qy][qx] = v;
 
  918                  DQQ2[dz][qy][qx] = w;
 
  923         MFEM_FOREACH_THREAD(qz,z,Q1D)
 
  925            MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  927               MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  932                  for (
int dz = 0; dz < D1DR; ++dz)
 
  934                     u += DQQ0[dz][qy][qx] * B[qz][dz];
 
  935                     v += DQQ1[dz][qy][qx] * B[qz][dz];
 
  936                     w += DQQ2[dz][qy][qx] * G[qz][dz];
 
  938                  QQQ0[qz][qy][qx] = 
u;
 
  939                  QQQ1[qz][qy][qx] = v;
 
  940                  QQQ2[qz][qy][qx] = w;
 
  945         MFEM_FOREACH_THREAD(qz,z,Q1D)
 
  947            MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  949               MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  951                  const int q = qx + (qy + qz * Q1D) * Q1D;
 
  952                  const real_t gX = QQQ0[qz][qy][qx];
 
  953                  const real_t gY = QQQ1[qz][qy][qx];
 
  954                  const real_t gZ = QQQ2[qz][qy][qx];
 
  955                  div[qz][qy][qx] += gX*Q(q,0,c,e) + gY*Q(q,1,c,e) + gZ*Q(q,2,c,e);
 
  964         MFEM_FOREACH_THREAD(d,y,D1DE)
 
  966            MFEM_FOREACH_THREAD(q,x,Q1D)
 
  974      MFEM_FOREACH_THREAD(qz,z,Q1D)
 
  976         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  978            MFEM_FOREACH_THREAD(dx,x,D1DE)
 
  981               for (
int qx = 0; qx < Q1D; ++qx)
 
  983                  u += div[qz][qy][qx] * Bt[dx][qx];
 
  985               QQD0[qz][qy][dx] = 
u;
 
  990      MFEM_FOREACH_THREAD(qz,z,Q1D)
 
  992         MFEM_FOREACH_THREAD(dy,y,D1DE)
 
  994            MFEM_FOREACH_THREAD(dx,x,D1DE)
 
  997               for (
int qy = 0; qy < Q1D; ++qy)
 
  999                  u += QQD0[qz][qy][dx] * Bt[dy][qy];
 
 1001               QDD0[qz][dy][dx] = 
u;
 
 1006      MFEM_FOREACH_THREAD(dz,z,D1DE)
 
 1008         MFEM_FOREACH_THREAD(dy,y,D1DE)
 
 1010            MFEM_FOREACH_THREAD(dx,x,D1DE)
 
 1013               for (
int qz = 0; qz < Q1D; ++qz)
 
 1015                  u += QDD0[qz][dy][dx] * Bt[dz][qz];
 
 1024static void PADivergenceApply(
const int dim,
 
 1029                              const Array<real_t> &B,
 
 1030                              const Array<real_t> &G,
 
 1031                              const Array<real_t> &Bt,
 
 1035                              bool transpose=
false)
 
 1041         return PADivergenceApplyTranspose2D(NE,B,G,Bt,op,x,y,TR_D1D,TE_D1D,Q1D);
 
 1045         return PADivergenceApply2D(NE,B,G,Bt,op,x,y,TR_D1D,TE_D1D,Q1D);
 
 1052         return PADivergenceApplyTranspose3D(NE,B,G,Bt,op,x,y,TR_D1D,TE_D1D,Q1D);
 
 1056         return PADivergenceApply3D(NE,B,G,Bt,op,x,y,TR_D1D,TE_D1D,Q1D);
 
 1059   MFEM_ABORT(
"Unknown kernel.");
 
 1065   PADivergenceApply(
dim, trial_dofs1D, test_dofs1D, quad1D, ne,
 
 1066                     trial_maps->
B, trial_maps->
G, test_maps->
Bt, pa_data, x, y,
 
 
 1074   PADivergenceApply(
dim, trial_dofs1D, test_dofs1D, quad1D, ne,
 
 1075                     trial_maps->
Bt, trial_maps->
Gt, test_maps->
B, pa_data, x, y,
 
 
const T * Read(bool on_dev=true) const
Shortcut for mfem::Read(a.GetMemory(), a.Size(), on_dev).
A coefficient that is constant across space and time.
static MemoryType GetMemoryType()
(DEPRECATED) Equivalent to GetDeviceMemoryType().
Array< real_t > G
Gradients/divergences/curls of basis functions evaluated at quadrature points.
@ TENSOR
Tensor product representation using 1D matrices/tensors with dimensions using 1D number of quadrature...
Array< real_t > B
Basis functions evaluated at quadrature points.
int ndof
Number of degrees of freedom = number of basis functions. When mode is TENSOR, this is the 1D number.
Array< real_t > Gt
Transpose of G.
int nqpt
Number of quadrature points. When mode is TENSOR, this is the 1D number.
Array< real_t > Bt
Transpose of B.
Class FiniteElementSpace - responsible for providing FEM view of the mesh, mainly managing the set of...
Ordering::Type GetOrdering() const
Return the ordering method.
int GetNE() const
Returns number of elements in the mesh.
Mesh * GetMesh() const
Returns the mesh.
const FiniteElement * GetTypicalFE() const
Return GetFE(0) if the local mesh is not empty; otherwise return a typical FE based on the Geometry t...
Abstract class for all finite elements.
virtual const DofToQuad & GetDofToQuad(const IntegrationRule &ir, DofToQuad::Mode mode) const
Return a DofToQuad structure corresponding to the given IntegrationRule using the given DofToQuad::Mo...
int GetDim() const
Returns the reference space dimension for the finite element.
Vector J
Jacobians of the element transformations at all quadrature points.
Class for an integration rule - an Array of IntegrationPoint.
int GetNPoints() const
Returns the number of the points in the integration rule.
const Array< real_t > & GetWeights() const
Return the quadrature weights in a contiguous array.
const IntegrationRule * IntRule
int Dimension() const
Dimension of the reference space used within the elements.
ElementTransformation * GetTypicalElementTransformation()
If the local mesh is not empty return GetElementTransformation(0); otherwise, return the identity tra...
const GeometricFactors * GetGeometricFactors(const IntegrationRule &ir, const int flags, MemoryType d_mt=MemoryType::DEFAULT)
Return the mesh geometric factors corresponding to the given integration rule.
void AssemblePA(const FiniteElementSpace &trial_fes, const FiniteElementSpace &test_fes) override
void AddMultTransposePA(const Vector &x, Vector &y) const override
Method for partially assembled transposed action.
static const IntegrationRule & GetRule(const FiniteElement &trial_fe, const FiniteElement &test_fe, const ElementTransformation &Trans)
void AddMultPA(const Vector &x, Vector &y) const override
Method for partially assembled action.
virtual const real_t * Read(bool on_dev=true) const
Shortcut for mfem::Read(vec.GetMemory(), vec.Size(), on_dev).
virtual real_t * ReadWrite(bool on_dev=true)
Shortcut for mfem::ReadWrite(vec.GetMemory(), vec.Size(), on_dev).
void SetSize(int s)
Resize the vector to size s.
void trans(const Vector &u, Vector &x)
real_t u(const Vector &xvec)
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
void forall_3D(int N, int X, int Y, int Z, lambda &&body)
void forall(int N, lambda &&body)
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.