22static void PAConvectionSetup2D(
const int NQ,
 
   24                                const Array<real_t> &w,
 
   30   constexpr int DIM = 2;
 
   32   const bool const_v = 
vel.Size() == 
DIM;
 
   34   const auto W = w.Read();
 
   36   const auto V = const_v ?
 
   43      const int e = q_global / NQ;
 
   44      const int q = q_global % NQ;
 
   45      const real_t J11 = J(q,0,0,e);
 
   46      const real_t J21 = J(q,1,0,e);
 
   47      const real_t J12 = J(q,0,1,e);
 
   48      const real_t J22 = J(q,1,1,e);
 
   50      const real_t v0 = const_v ? V(0,0,0) : V(0,q,e);
 
   51      const real_t v1 = const_v ? V(1,0,0) : V(1,q,e);
 
   55      y(q,0,e) =  wx * J22 - wy * J12; 
 
   56      y(q,1,e) = -wx * J21 + wy * J11; 
 
   61static void PAConvectionSetup3D(
const int NQ,
 
   63                                const Array<real_t> &w,
 
   69   constexpr int DIM = 3;
 
   71   const auto W = 
Reshape(w.Read(), NQ);
 
   73   const bool const_v = 
vel.Size() == 
DIM;
 
   74   const auto V = const_v ?
 
   77   auto y = 
Reshape(op.Write(), NQ,3,NE);
 
   80      const int e = q_global / NQ;
 
   81      const int q = q_global % NQ;
 
   82      const real_t J11 = J(q,0,0,e);
 
   83      const real_t J12 = J(q,0,1,e);
 
   84      const real_t J13 = J(q,0,2,e);
 
   85      const real_t J21 = J(q,1,0,e);
 
   86      const real_t J22 = J(q,1,1,e);
 
   87      const real_t J23 = J(q,1,2,e);
 
   88      const real_t J31 = J(q,2,0,e);
 
   89      const real_t J32 = J(q,2,1,e);
 
   90      const real_t J33 = J(q,2,2,e);
 
   92      const real_t v0 = const_v ? V(0,0,0) : V(0,q,e);
 
   93      const real_t v1 = const_v ? V(1,0,0) : V(1,q,e);
 
   94      const real_t v2 = const_v ? V(2,0,0) : V(2,q,e);
 
   99      const real_t A11 = (J22 * J33) - (J23 * J32);
 
  100      const real_t A12 = (J32 * J13) - (J12 * J33);
 
  101      const real_t A13 = (J12 * J23) - (J22 * J13);
 
  102      const real_t A21 = (J31 * J23) - (J21 * J33);
 
  103      const real_t A22 = (J11 * J33) - (J13 * J31);
 
  104      const real_t A23 = (J21 * J13) - (J11 * J23);
 
  105      const real_t A31 = (J21 * J32) - (J31 * J22);
 
  106      const real_t A32 = (J31 * J12) - (J11 * J32);
 
  107      const real_t A33 = (J11 * J22) - (J12 * J21);
 
  109      y(q,0,e) = wx * A11 + wy * A12 + wz * A13;
 
  110      y(q,1,e) = wx * A21 + wy * A22 + wz * A23;
 
  111      y(q,2,e) = wx * A31 + wy * A32 + wz * A33;
 
  115static void PAConvectionSetup(
const int dim,
 
  118                              const Array<real_t> &W,
 
  124   if (
dim == 1) { MFEM_ABORT(
"dim==1 not supported in PAConvectionSetup"); }
 
  127      PAConvectionSetup2D(NQ, NE, W, J, coeff, 
alpha, op);
 
  131      PAConvectionSetup3D(NQ, NE, W, J, coeff, 
alpha, op);
 
  159   const int dims = el.
GetDim();
 
  160   const int symmDims = dims;
 
 
  185      MFEM_ABORT(
"AssembleDiagonalPA not yet implemented for" 
  186                 " ConvectionIntegrator.");
 
 
  191template<
int T_D1D = 0, 
int T_Q1D = 0> 
static 
  192void PAConvectionApply2D(
const int ne,
 
  204   const int D1D = T_D1D ? T_D1D : d1d;
 
  205   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  208   auto B = 
Reshape(
b.Read(), Q1D, D1D);
 
  216      const int D1D = T_D1D ? T_D1D : d1d;
 
  217      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  219      constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  220      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  223      for (
int dy = 0; dy < D1D; ++dy)
 
  225         for (
int dx = 0; dx < D1D; ++dx)
 
  227            u[dy][dx] = x(dx,dy,e);
 
  230      real_t Bu[max_D1D][max_Q1D];
 
  231      real_t Gu[max_D1D][max_Q1D];
 
  232      for (
int dy = 0; dy < D1D; ++dy)
 
  234         for (
int qx = 0; qx < Q1D; ++qx)
 
  238            for (
int dx = 0; dx < D1D; ++dx)
 
  240               const real_t bx = B(qx,dx);
 
  241               const real_t gx = G(qx,dx);
 
  243               Bu[dy][qx] += bx * x;
 
  244               Gu[dy][qx] += gx * x;
 
  248      real_t GBu[max_Q1D][max_Q1D];
 
  249      real_t BGu[max_Q1D][max_Q1D];
 
  250      for (
int qx = 0; qx < Q1D; ++qx)
 
  252         for (
int qy = 0; qy < Q1D; ++qy)
 
  256            for (
int dy = 0; dy < D1D; ++dy)
 
  258               const real_t bx = B(qy,dy);
 
  259               const real_t gx = G(qy,dy);
 
  260               GBu[qy][qx] += gx * Bu[dy][qx];
 
  261               BGu[qy][qx] += bx * Gu[dy][qx];
 
  266      real_t DGu[max_Q1D][max_Q1D];
 
  267      for (
int qy = 0; qy < Q1D; ++qy)
 
  269         for (
int qx = 0; qx < Q1D; ++qx)
 
  271            const real_t O1 = op(qx,qy,0,e);
 
  272            const real_t O2 = op(qx,qy,1,e);
 
  274            const real_t gradX = BGu[qy][qx];
 
  275            const real_t gradY = GBu[qy][qx];
 
  277            DGu[qy][qx] = (O1 * gradX) + (O2 * gradY);
 
  280      real_t BDGu[max_D1D][max_Q1D];
 
  281      for (
int qx = 0; qx < Q1D; ++qx)
 
  283         for (
int dy = 0; dy < D1D; ++dy)
 
  286            for (
int qy = 0; qy < Q1D; ++qy)
 
  288               const real_t w = Bt(dy,qy);
 
  289               BDGu[dy][qx] += w * DGu[qy][qx];
 
  293      for (
int dx = 0; dx < D1D; ++dx)
 
  295         for (
int dy = 0; dy < D1D; ++dy)
 
  298            for (
int qx = 0; qx < Q1D; ++qx)
 
  300               const real_t w = Bt(dx,qx);
 
  301               BBDGu += w * BDGu[dy][qx];
 
  310template<
int T_D1D = 0, 
int T_Q1D = 0, 
int T_NBZ = 0> 
static 
  311void SmemPAConvectionApply2D(
const int ne,
 
  312                             const Array<real_t> &
b,
 
  313                             const Array<real_t> &g,
 
  314                             const Array<real_t> &bt,
 
  315                             const Array<real_t> >,
 
  323   const int D1D = T_D1D ? T_D1D : d1d;
 
  324   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  325   constexpr int NBZ = T_NBZ ? T_NBZ : 1;
 
  328   auto B = 
Reshape(
b.Read(), Q1D, D1D);
 
  329   auto G = 
Reshape(g.Read(), Q1D, D1D);
 
  330   auto Bt = 
Reshape(bt.Read(), D1D, Q1D);
 
  331   auto op = 
Reshape(op_.Read(), Q1D, Q1D, 2, NE);
 
  332   auto x = 
Reshape(x_.Read(), D1D, D1D, NE);
 
  333   auto y = 
Reshape(y_.ReadWrite(), D1D, D1D, NE);
 
  336      const int tidz = MFEM_THREAD_ID(z);
 
  337      const int D1D = T_D1D ? T_D1D : d1d;
 
  338      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  340      constexpr int NBZ = T_NBZ ? T_NBZ : 1;
 
  341      constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  342      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  344      MFEM_SHARED 
real_t u[NBZ][max_D1D][max_D1D];
 
  345      MFEM_FOREACH_THREAD(dy,y,D1D)
 
  347         MFEM_FOREACH_THREAD(dx,x,D1D)
 
  350            u[tidz][dy][dx] = x(dx,dy,e);
 
  354      MFEM_SHARED 
real_t Bu[NBZ][max_D1D][max_Q1D];
 
  355      MFEM_SHARED 
real_t Gu[NBZ][max_D1D][max_Q1D];
 
  356      MFEM_FOREACH_THREAD(dy,y,D1D)
 
  358         MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  360            Bu[tidz][dy][qx] = 0.0;
 
  361            Gu[tidz][dy][qx] = 0.0;
 
  362            for (
int dx = 0; dx < D1D; ++dx)
 
  364               const real_t bx = B(qx,dx);
 
  365               const real_t gx = G(qx,dx);
 
  366               const real_t x = 
u[tidz][dy][dx];
 
  367               Bu[tidz][dy][qx] += bx * x;
 
  368               Gu[tidz][dy][qx] += gx * x;
 
  373      MFEM_SHARED 
real_t GBu[NBZ][max_Q1D][max_Q1D];
 
  374      MFEM_SHARED 
real_t BGu[NBZ][max_Q1D][max_Q1D];
 
  375      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  377         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  379            GBu[tidz][qy][qx] = 0.0;
 
  380            BGu[tidz][qy][qx] = 0.0;
 
  381            for (
int dy = 0; dy < D1D; ++dy)
 
  383               const real_t bx = B(qy,dy);
 
  384               const real_t gx = G(qy,dy);
 
  385               GBu[tidz][qy][qx] += gx * Bu[tidz][dy][qx];
 
  386               BGu[tidz][qy][qx] += bx * Gu[tidz][dy][qx];
 
  392      MFEM_SHARED 
real_t DGu[NBZ][max_Q1D][max_Q1D];
 
  393      MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  395         MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  397            const real_t O1 = op(qx,qy,0,e);
 
  398            const real_t O2 = op(qx,qy,1,e);
 
  400            const real_t gradX = BGu[tidz][qy][qx];
 
  401            const real_t gradY = GBu[tidz][qy][qx];
 
  403            DGu[tidz][qy][qx] = (O1 * gradX) + (O2 * gradY);
 
  407      MFEM_SHARED 
real_t BDGu[NBZ][max_D1D][max_Q1D];
 
  408      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  410         MFEM_FOREACH_THREAD(dy,y,D1D)
 
  412            BDGu[tidz][dy][qx] = 0.0;
 
  413            for (
int qy = 0; qy < Q1D; ++qy)
 
  415               const real_t w = Bt(dy,qy);
 
  416               BDGu[tidz][dy][qx] += w * DGu[tidz][qy][qx];
 
  421      MFEM_FOREACH_THREAD(dx,x,D1D)
 
  423         MFEM_FOREACH_THREAD(dy,y,D1D)
 
  426            for (
int qx = 0; qx < Q1D; ++qx)
 
  428               const real_t w = Bt(dx,qx);
 
  429               BBDGu += w * BDGu[tidz][dy][qx];
 
  438template<
int T_D1D = 0, 
int T_Q1D = 0> 
static 
  439void PAConvectionApply3D(
const int ne,
 
  440                         const Array<real_t> &
b,
 
  441                         const Array<real_t> &g,
 
  442                         const Array<real_t> &bt,
 
  443                         const Array<real_t> >,
 
  451   const int D1D = T_D1D ? T_D1D : d1d;
 
  452   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  455   auto B = 
Reshape(
b.Read(), Q1D, D1D);
 
  456   auto G = 
Reshape(g.Read(), Q1D, D1D);
 
  457   auto Bt = 
Reshape(bt.Read(), D1D, Q1D);
 
  458   auto op = 
Reshape(op_.Read(), Q1D, Q1D, Q1D, 3, NE);
 
  459   auto x = 
Reshape(x_.Read(), D1D, D1D, D1D, NE);
 
  460   auto y = 
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
 
  463      const int D1D = T_D1D ? T_D1D : d1d;
 
  464      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  466      constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  467      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  469      real_t u[max_D1D][max_D1D][max_D1D];
 
  470      for (
int dz = 0; dz < D1D; ++dz)
 
  472         for (
int dy = 0; dy < D1D; ++dy)
 
  474            for (
int dx = 0; dx < D1D; ++dx)
 
  476               u[dz][dy][dx] = x(dx,dy,dz,e);
 
  480      real_t Bu[max_D1D][max_D1D][max_Q1D];
 
  481      real_t Gu[max_D1D][max_D1D][max_Q1D];
 
  482      for (
int dz = 0; dz < D1D; ++dz)
 
  484         for (
int dy = 0; dy < D1D; ++dy)
 
  486            for (
int qx = 0; qx < Q1D; ++qx)
 
  488               Bu[dz][dy][qx] = 0.0;
 
  489               Gu[dz][dy][qx] = 0.0;
 
  490               for (
int dx = 0; dx < D1D; ++dx)
 
  492                  const real_t bx = B(qx,dx);
 
  493                  const real_t gx = G(qx,dx);
 
  494                  const real_t x = 
u[dz][dy][dx];
 
  495                  Bu[dz][dy][qx] += bx * x;
 
  496                  Gu[dz][dy][qx] += gx * x;
 
  501      real_t BBu[max_D1D][max_Q1D][max_Q1D];
 
  502      real_t GBu[max_D1D][max_Q1D][max_Q1D];
 
  503      real_t BGu[max_D1D][max_Q1D][max_Q1D];
 
  504      for (
int dz = 0; dz < D1D; ++dz)
 
  506         for (
int qx = 0; qx < Q1D; ++qx)
 
  508            for (
int qy = 0; qy < Q1D; ++qy)
 
  510               BBu[dz][qy][qx] = 0.0;
 
  511               GBu[dz][qy][qx] = 0.0;
 
  512               BGu[dz][qy][qx] = 0.0;
 
  513               for (
int dy = 0; dy < D1D; ++dy)
 
  515                  const real_t bx = B(qy,dy);
 
  516                  const real_t gx = G(qy,dy);
 
  517                  BBu[dz][qy][qx] += bx * Bu[dz][dy][qx];
 
  518                  GBu[dz][qy][qx] += gx * Bu[dz][dy][qx];
 
  519                  BGu[dz][qy][qx] += bx * Gu[dz][dy][qx];
 
  524      real_t GBBu[max_Q1D][max_Q1D][max_Q1D];
 
  525      real_t BGBu[max_Q1D][max_Q1D][max_Q1D];
 
  526      real_t BBGu[max_Q1D][max_Q1D][max_Q1D];
 
  527      for (
int qx = 0; qx < Q1D; ++qx)
 
  529         for (
int qy = 0; qy < Q1D; ++qy)
 
  531            for (
int qz = 0; qz < Q1D; ++qz)
 
  533               GBBu[qz][qy][qx] = 0.0;
 
  534               BGBu[qz][qy][qx] = 0.0;
 
  535               BBGu[qz][qy][qx] = 0.0;
 
  536               for (
int dz = 0; dz < D1D; ++dz)
 
  538                  const real_t bx = B(qz,dz);
 
  539                  const real_t gx = G(qz,dz);
 
  540                  GBBu[qz][qy][qx] += gx * BBu[dz][qy][qx];
 
  541                  BGBu[qz][qy][qx] += bx * GBu[dz][qy][qx];
 
  542                  BBGu[qz][qy][qx] += bx * BGu[dz][qy][qx];
 
  548      real_t DGu[max_Q1D][max_Q1D][max_Q1D];
 
  549      for (
int qz = 0; qz < Q1D; ++qz)
 
  551         for (
int qy = 0; qy < Q1D; ++qy)
 
  553            for (
int qx = 0; qx < Q1D; ++qx)
 
  555               const real_t O1 = op(qx,qy,qz,0,e);
 
  556               const real_t O2 = op(qx,qy,qz,1,e);
 
  557               const real_t O3 = op(qx,qy,qz,2,e);
 
  559               const real_t gradX = BBGu[qz][qy][qx];
 
  560               const real_t gradY = BGBu[qz][qy][qx];
 
  561               const real_t gradZ = GBBu[qz][qy][qx];
 
  563               DGu[qz][qy][qx] = (O1 * gradX) + (O2 * gradY) + (O3 * gradZ);
 
  567      real_t BDGu[max_D1D][max_Q1D][max_Q1D];
 
  568      for (
int qx = 0; qx < Q1D; ++qx)
 
  570         for (
int qy = 0; qy < Q1D; ++qy)
 
  572            for (
int dz = 0; dz < D1D; ++dz)
 
  574               BDGu[dz][qy][qx] = 0.0;
 
  575               for (
int qz = 0; qz < Q1D; ++qz)
 
  577                  const real_t w = Bt(dz,qz);
 
  578                  BDGu[dz][qy][qx] += w * DGu[qz][qy][qx];
 
  583      real_t BBDGu[max_D1D][max_D1D][max_Q1D];
 
  584      for (
int dz = 0; dz < D1D; ++dz)
 
  586         for (
int qx = 0; qx < Q1D; ++qx)
 
  588            for (
int dy = 0; dy < D1D; ++dy)
 
  590               BBDGu[dz][dy][qx] = 0.0;
 
  591               for (
int qy = 0; qy < Q1D; ++qy)
 
  593                  const real_t w = Bt(dy,qy);
 
  594                  BBDGu[dz][dy][qx] += w * BDGu[dz][qy][qx];
 
  599      for (
int dz = 0; dz < D1D; ++dz)
 
  601         for (
int dy = 0; dy < D1D; ++dy)
 
  603            for (
int dx = 0; dx < D1D; ++dx)
 
  606               for (
int qx = 0; qx < Q1D; ++qx)
 
  608                  const real_t w = Bt(dx,qx);
 
  609                  BBBDGu += w * BBDGu[dz][dy][qx];
 
  611               y(dx,dy,dz,e) += BBBDGu;
 
  619template<
int T_D1D = 0, 
int T_Q1D = 0> 
static 
  620void SmemPAConvectionApply3D(
const int ne,
 
  621                             const Array<real_t> &
b,
 
  622                             const Array<real_t> &g,
 
  623                             const Array<real_t> &bt,
 
  624                             const Array<real_t> >,
 
  632   const int D1D = T_D1D ? T_D1D : d1d;
 
  633   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  636   auto B = 
Reshape(
b.Read(), Q1D, D1D);
 
  637   auto G = 
Reshape(g.Read(), Q1D, D1D);
 
  638   auto Bt = 
Reshape(bt.Read(), D1D, Q1D);
 
  639   auto op = 
Reshape(op_.Read(), Q1D, Q1D, Q1D, 3, NE);
 
  640   auto x = 
Reshape(x_.Read(), D1D, D1D, D1D, NE);
 
  641   auto y = 
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
 
  644      const int D1D = T_D1D ? T_D1D : d1d;
 
  645      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  647      constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  648      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  649      constexpr int max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D;
 
  650      MFEM_SHARED 
real_t sm0[max_DQ*max_DQ*max_DQ];
 
  651      MFEM_SHARED 
real_t sm1[max_DQ*max_DQ*max_DQ];
 
  652      MFEM_SHARED 
real_t sm2[max_DQ*max_DQ*max_DQ];
 
  653      MFEM_SHARED 
real_t sm3[max_DQ*max_DQ*max_DQ];
 
  654      MFEM_SHARED 
real_t sm4[max_DQ*max_DQ*max_DQ];
 
  655      MFEM_SHARED 
real_t sm5[max_DQ*max_DQ*max_DQ];
 
  657      real_t (*
u)[max_D1D][max_D1D] = (
real_t (*)[max_D1D][max_D1D]) sm0;
 
  658      MFEM_FOREACH_THREAD(dz,z,D1D)
 
  660         MFEM_FOREACH_THREAD(dy,y,D1D)
 
  662            MFEM_FOREACH_THREAD(dx,x,D1D)
 
  664               u[dz][dy][dx] = x(dx,dy,dz,e);
 
  669      real_t (*Bu)[max_D1D][max_Q1D] = (
real_t (*)[max_D1D][max_Q1D])sm1;
 
  670      real_t (*Gu)[max_D1D][max_Q1D] = (
real_t (*)[max_D1D][max_Q1D])sm2;
 
  671      MFEM_FOREACH_THREAD(dz,z,D1D)
 
  673         MFEM_FOREACH_THREAD(dy,y,D1D)
 
  675            MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  679               for (
int dx = 0; dx < D1D; ++dx)
 
  681                  const real_t bx = B(qx,dx);
 
  682                  const real_t gx = G(qx,dx);
 
  683                  const real_t x = 
u[dz][dy][dx];
 
  687               Bu[dz][dy][qx] = Bu_;
 
  688               Gu[dz][dy][qx] = Gu_;
 
  693      real_t (*BBu)[max_Q1D][max_Q1D] = (
real_t (*)[max_Q1D][max_Q1D])sm3;
 
  694      real_t (*GBu)[max_Q1D][max_Q1D] = (
real_t (*)[max_Q1D][max_Q1D])sm4;
 
  695      real_t (*BGu)[max_Q1D][max_Q1D] = (
real_t (*)[max_Q1D][max_Q1D])sm5;
 
  696      MFEM_FOREACH_THREAD(dz,z,D1D)
 
  698         MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  700            MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  705               for (
int dy = 0; dy < D1D; ++dy)
 
  707                  const real_t bx = B(qy,dy);
 
  708                  const real_t gx = G(qy,dy);
 
  709                  BBu_ += bx * Bu[dz][dy][qx];
 
  710                  GBu_ += gx * Bu[dz][dy][qx];
 
  711                  BGu_ += bx * Gu[dz][dy][qx];
 
  713               BBu[dz][qy][qx] = BBu_;
 
  714               GBu[dz][qy][qx] = GBu_;
 
  715               BGu[dz][qy][qx] = BGu_;
 
  720      real_t (*GBBu)[max_Q1D][max_Q1D] = (
real_t (*)[max_Q1D][max_Q1D])sm0;
 
  721      real_t (*BGBu)[max_Q1D][max_Q1D] = (
real_t (*)[max_Q1D][max_Q1D])sm1;
 
  722      real_t (*BBGu)[max_Q1D][max_Q1D] = (
real_t (*)[max_Q1D][max_Q1D])sm2;
 
  723      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  725         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  727            MFEM_FOREACH_THREAD(qz,z,Q1D)
 
  732               for (
int dz = 0; dz < D1D; ++dz)
 
  734                  const real_t bx = B(qz,dz);
 
  735                  const real_t gx = G(qz,dz);
 
  736                  GBBu_ += gx * BBu[dz][qy][qx];
 
  737                  BGBu_ += bx * GBu[dz][qy][qx];
 
  738                  BBGu_ += bx * BGu[dz][qy][qx];
 
  740               GBBu[qz][qy][qx] = GBBu_;
 
  741               BGBu[qz][qy][qx] = BGBu_;
 
  742               BBGu[qz][qy][qx] = BBGu_;
 
  747      real_t (*DGu)[max_Q1D][max_Q1D] = (
real_t (*)[max_Q1D][max_Q1D])sm3;
 
  748      MFEM_FOREACH_THREAD(qz,z,Q1D)
 
  750         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  752            MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  754               const real_t O1 = op(qx,qy,qz,0,e);
 
  755               const real_t O2 = op(qx,qy,qz,1,e);
 
  756               const real_t O3 = op(qx,qy,qz,2,e);
 
  758               const real_t gradX = BBGu[qz][qy][qx];
 
  759               const real_t gradY = BGBu[qz][qy][qx];
 
  760               const real_t gradZ = GBBu[qz][qy][qx];
 
  762               DGu[qz][qy][qx] = (O1 * gradX) + (O2 * gradY) + (O3 * gradZ);
 
  767      real_t (*BDGu)[max_Q1D][max_Q1D] = (
real_t (*)[max_Q1D][max_Q1D])sm4;
 
  768      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  770         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  772            MFEM_FOREACH_THREAD(dz,z,D1D)
 
  775               for (
int qz = 0; qz < Q1D; ++qz)
 
  777                  const real_t w = Bt(dz,qz);
 
  778                  BDGu_ += w * DGu[qz][qy][qx];
 
  780               BDGu[dz][qy][qx] = BDGu_;
 
  785      real_t (*BBDGu)[max_D1D][max_Q1D] = (
real_t (*)[max_D1D][max_Q1D])sm5;
 
  786      MFEM_FOREACH_THREAD(dz,z,D1D)
 
  788         MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  790            MFEM_FOREACH_THREAD(dy,y,D1D)
 
  793               for (
int qy = 0; qy < Q1D; ++qy)
 
  795                  const real_t w = Bt(dy,qy);
 
  796                  BBDGu_ += w * BDGu[dz][qy][qx];
 
  798               BBDGu[dz][dy][qx] = BBDGu_;
 
  803      MFEM_FOREACH_THREAD(dz,z,D1D)
 
  805         MFEM_FOREACH_THREAD(dy,y,D1D)
 
  807            MFEM_FOREACH_THREAD(dx,x,D1D)
 
  810               for (
int qx = 0; qx < Q1D; ++qx)
 
  812                  const real_t w = Bt(dx,qx);
 
  813                  BBBDGu += w * BBDGu[dz][dy][qx];
 
  815               y(dx,dy,dz,e) += BBBDGu;
 
  823template<
int T_D1D = 0, 
int T_Q1D = 0> 
static 
  824void PAConvectionApplyT2D(
const int ne,
 
  825                          const Array<real_t> &
b,
 
  826                          const Array<real_t> &g,
 
  827                          const Array<real_t> &bt,
 
  828                          const Array<real_t> >,
 
  836   const int D1D = T_D1D ? T_D1D : d1d;
 
  837   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  840   auto B = 
Reshape(
b.Read(), Q1D, D1D);
 
  841   auto Bt = 
Reshape(bt.Read(), D1D, Q1D);
 
  842   auto Gt = 
Reshape(gt.Read(), D1D, Q1D);
 
  843   auto op = 
Reshape(op_.Read(), Q1D, Q1D, 2, NE);
 
  844   auto x = 
Reshape(x_.Read(), D1D, D1D, NE);
 
  845   auto y = 
Reshape(y_.ReadWrite(), D1D, D1D, NE);
 
  848      const int D1D = T_D1D ? T_D1D : d1d;
 
  849      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  851      constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  852      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  855      for (
int dy = 0; dy < D1D; ++dy)
 
  857         for (
int dx = 0; dx < D1D; ++dx)
 
  859            u[dy][dx] = x(dx,dy,e);
 
  862      real_t Bu[max_D1D][max_Q1D];
 
  863      for (
int dy = 0; dy < D1D; ++dy)
 
  865         for (
int qx = 0; qx < Q1D; ++qx)
 
  868            for (
int dx = 0; dx < D1D; ++dx)
 
  870               const real_t bx = B(qx,dx);
 
  872               Bu[dy][qx] += bx * x;
 
  876      real_t BBu[max_Q1D][max_Q1D];
 
  877      for (
int qx = 0; qx < Q1D; ++qx)
 
  879         for (
int qy = 0; qy < Q1D; ++qy)
 
  882            for (
int dy = 0; dy < D1D; ++dy)
 
  884               const real_t bx = B(qy,dy);
 
  885               BBu[qy][qx] += bx * Bu[dy][qx];
 
  890      real_t DBu[max_Q1D][max_Q1D][2];
 
  891      for (
int qy = 0; qy < Q1D; ++qy)
 
  893         for (
int qx = 0; qx < Q1D; ++qx)
 
  895            const real_t O1 = op(qx,qy,0,e);
 
  896            const real_t O2 = op(qx,qy,1,e);
 
  898            const real_t X = BBu[qy][qx];
 
  900            DBu[qy][qx][0] = O1 * X;
 
  901            DBu[qy][qx][1] = O2 * X;
 
  904      real_t GDBu[max_D1D][max_Q1D][2];
 
  905      for (
int qx = 0; qx < Q1D; ++qx)
 
  907         for (
int dy = 0; dy < D1D; ++dy)
 
  909            GDBu[dy][qx][0] = 0.0;
 
  910            GDBu[dy][qx][1] = 0.0;
 
  911            for (
int qy = 0; qy < Q1D; ++qy)
 
  913               const real_t by = Bt(dy,qy);
 
  914               const real_t gy = Gt(dy,qy);
 
  915               GDBu[dy][qx][0] += by * DBu[qy][qx][0];
 
  916               GDBu[dy][qx][1] += gy * DBu[qy][qx][1];
 
  920      for (
int dx = 0; dx < D1D; ++dx)
 
  922         for (
int dy = 0; dy < D1D; ++dy)
 
  925            for (
int qx = 0; qx < Q1D; ++qx)
 
  927               const real_t bx = Bt(dx,qx);
 
  928               const real_t gx = Gt(dx,qx);
 
  929               res += gx * GDBu[dy][qx][0] + bx * GDBu[dy][qx][1];
 
  938template<
int T_D1D = 0, 
int T_Q1D = 0, 
int T_NBZ = 0> 
static 
  939void SmemPAConvectionApplyT2D(
const int ne,
 
  940                              const Array<real_t> &
b,
 
  941                              const Array<real_t> &g,
 
  942                              const Array<real_t> &bt,
 
  943                              const Array<real_t> >,
 
  951   const int D1D = T_D1D ? T_D1D : d1d;
 
  952   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  953   constexpr int NBZ = T_NBZ ? T_NBZ : 1;
 
  956   auto B = 
Reshape(
b.Read(), Q1D, D1D);
 
  957   auto Bt = 
Reshape(bt.Read(), D1D, Q1D);
 
  958   auto Gt = 
Reshape(gt.Read(), D1D, Q1D);
 
  959   auto op = 
Reshape(op_.Read(), Q1D, Q1D, 2, NE);
 
  960   auto x = 
Reshape(x_.Read(), D1D, D1D, NE);
 
  961   auto y = 
Reshape(y_.ReadWrite(), D1D, D1D, NE);
 
  964      const int tidz = MFEM_THREAD_ID(z);
 
  965      const int D1D = T_D1D ? T_D1D : d1d;
 
  966      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  968      constexpr int NBZ = T_NBZ ? T_NBZ : 1;
 
  969      constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  970      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  971      MFEM_SHARED 
real_t u[NBZ][max_D1D][max_D1D];
 
  972      MFEM_FOREACH_THREAD(dy,y,D1D)
 
  974         MFEM_FOREACH_THREAD(dx,x,D1D)
 
  977            u[tidz][dy][dx] = x(dx,dy,e);
 
  981      MFEM_SHARED 
real_t Bu[NBZ][max_D1D][max_Q1D];
 
  982      MFEM_FOREACH_THREAD(dy,y,D1D)
 
  984         MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  986            Bu[tidz][dy][qx] = 0.0;
 
  987            for (
int dx = 0; dx < D1D; ++dx)
 
  989               const real_t bx = B(qx,dx);
 
  990               const real_t x = 
u[tidz][dy][dx];
 
  991               Bu[tidz][dy][qx] += bx * x;
 
  996      MFEM_SHARED 
real_t BBu[NBZ][max_Q1D][max_Q1D];
 
  997      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  999         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 1001            BBu[tidz][qy][qx] = 0.0;
 
 1002            for (
int dy = 0; dy < D1D; ++dy)
 
 1004               const real_t bx = B(qy,dy);
 
 1005               BBu[tidz][qy][qx] += bx * Bu[tidz][dy][qx];
 
 1011      MFEM_SHARED 
real_t DBu[NBZ][max_Q1D][max_Q1D][2];
 
 1012      MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 1014         MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 1016            const real_t O1 = op(qx,qy,0,e);
 
 1017            const real_t O2 = op(qx,qy,1,e);
 
 1019            const real_t X = BBu[tidz][qy][qx];
 
 1021            DBu[tidz][qy][qx][0] = O1 * X;
 
 1022            DBu[tidz][qy][qx][1] = O2 * X;
 
 1026      MFEM_SHARED 
real_t GDBu[NBZ][max_D1D][max_Q1D][2];
 
 1027      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 1029         MFEM_FOREACH_THREAD(dy,y,D1D)
 
 1031            GDBu[tidz][dy][qx][0] = 0.0;
 
 1032            GDBu[tidz][dy][qx][1] = 0.0;
 
 1033            for (
int qy = 0; qy < Q1D; ++qy)
 
 1035               const real_t by = Bt(dy,qy);
 
 1036               const real_t gy = Gt(dy,qy);
 
 1037               GDBu[tidz][dy][qx][0] += by * DBu[tidz][qy][qx][0];
 
 1038               GDBu[tidz][dy][qx][1] += gy * DBu[tidz][qy][qx][1];
 
 1043      MFEM_FOREACH_THREAD(dx,x,D1D)
 
 1045         MFEM_FOREACH_THREAD(dy,y,D1D)
 
 1048            for (
int qx = 0; qx < Q1D; ++qx)
 
 1050               const real_t bx = Bt(dx,qx);
 
 1051               const real_t gx = Gt(dx,qx);
 
 1052               res += gx * GDBu[tidz][dy][qx][0] + bx * GDBu[tidz][dy][qx][1];
 
 1061template<
int T_D1D = 0, 
int T_Q1D = 0> 
static 
 1062void PAConvectionApplyT3D(
const int ne,
 
 1063                          const Array<real_t> &
b,
 
 1064                          const Array<real_t> &g,
 
 1065                          const Array<real_t> &bt,
 
 1066                          const Array<real_t> >,
 
 1074   const int D1D = T_D1D ? T_D1D : d1d;
 
 1075   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 1078   auto B = 
Reshape(
b.Read(), Q1D, D1D);
 
 1079   auto Bt = 
Reshape(bt.Read(), D1D, Q1D);
 
 1080   auto Gt = 
Reshape(gt.Read(), D1D, Q1D);
 
 1081   auto op = 
Reshape(op_.Read(), Q1D, Q1D, Q1D, 3, NE);
 
 1082   auto x = 
Reshape(x_.Read(), D1D, D1D, D1D, NE);
 
 1083   auto y = 
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
 
 1086      const int D1D = T_D1D ? T_D1D : d1d;
 
 1087      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 1089      constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
 1090      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
 1092      real_t u[max_D1D][max_D1D][max_D1D];
 
 1093      for (
int dz = 0; dz < D1D; ++dz)
 
 1095         for (
int dy = 0; dy < D1D; ++dy)
 
 1097            for (
int dx = 0; dx < D1D; ++dx)
 
 1099               u[dz][dy][dx] = x(dx,dy,dz,e);
 
 1103      real_t Bu[max_D1D][max_D1D][max_Q1D];
 
 1104      for (
int dz = 0; dz < D1D; ++dz)
 
 1106         for (
int dy = 0; dy < D1D; ++dy)
 
 1108            for (
int qx = 0; qx < Q1D; ++qx)
 
 1110               Bu[dz][dy][qx] = 0.0;
 
 1111               for (
int dx = 0; dx < D1D; ++dx)
 
 1113                  const real_t bx = B(qx,dx);
 
 1114                  const real_t x = 
u[dz][dy][dx];
 
 1115                  Bu[dz][dy][qx] += bx * x;
 
 1120      real_t BBu[max_D1D][max_Q1D][max_Q1D];
 
 1121      for (
int dz = 0; dz < D1D; ++dz)
 
 1123         for (
int qx = 0; qx < Q1D; ++qx)
 
 1125            for (
int qy = 0; qy < Q1D; ++qy)
 
 1127               BBu[dz][qy][qx] = 0.0;
 
 1128               for (
int dy = 0; dy < D1D; ++dy)
 
 1130                  const real_t bx = B(qy,dy);
 
 1131                  BBu[dz][qy][qx] += bx * Bu[dz][dy][qx];
 
 1136      real_t BBBu[max_Q1D][max_Q1D][max_Q1D];
 
 1137      for (
int qx = 0; qx < Q1D; ++qx)
 
 1139         for (
int qy = 0; qy < Q1D; ++qy)
 
 1141            for (
int qz = 0; qz < Q1D; ++qz)
 
 1143               BBBu[qz][qy][qx] = 0.0;
 
 1144               for (
int dz = 0; dz < D1D; ++dz)
 
 1146                  const real_t bx = B(qz,dz);
 
 1147                  BBBu[qz][qy][qx] += bx * BBu[dz][qy][qx];
 
 1153      real_t DBu[max_Q1D][max_Q1D][max_Q1D][3];
 
 1154      for (
int qz = 0; qz < Q1D; ++qz)
 
 1156         for (
int qy = 0; qy < Q1D; ++qy)
 
 1158            for (
int qx = 0; qx < Q1D; ++qx)
 
 1160               const real_t O1 = op(qx,qy,qz,0,e);
 
 1161               const real_t O2 = op(qx,qy,qz,1,e);
 
 1162               const real_t O3 = op(qx,qy,qz,2,e);
 
 1164               const real_t X = BBBu[qz][qy][qx];
 
 1166               DBu[qz][qy][qx][0] = O1 * X;
 
 1167               DBu[qz][qy][qx][1] = O2 * X;
 
 1168               DBu[qz][qy][qx][2] = O3 * X;
 
 1172      real_t GDBu[max_D1D][max_Q1D][max_Q1D][3];
 
 1173      for (
int qx = 0; qx < Q1D; ++qx)
 
 1175         for (
int qy = 0; qy < Q1D; ++qy)
 
 1177            for (
int dz = 0; dz < D1D; ++dz)
 
 1179               GDBu[dz][qy][qx][0] = 0.0;
 
 1180               GDBu[dz][qy][qx][1] = 0.0;
 
 1181               GDBu[dz][qy][qx][2] = 0.0;
 
 1182               for (
int qz = 0; qz < Q1D; ++qz)
 
 1184                  const real_t bz = Bt(dz,qz);
 
 1185                  const real_t gz = Gt(dz,qz);
 
 1186                  GDBu[dz][qy][qx][0] += bz * DBu[qz][qy][qx][0];
 
 1187                  GDBu[dz][qy][qx][1] += bz * DBu[qz][qy][qx][1];
 
 1188                  GDBu[dz][qy][qx][2] += gz * DBu[qz][qy][qx][2];
 
 1193      real_t GGDBu[max_D1D][max_D1D][max_Q1D][3];
 
 1194      for (
int dz = 0; dz < D1D; ++dz)
 
 1196         for (
int qx = 0; qx < Q1D; ++qx)
 
 1198            for (
int dy = 0; dy < D1D; ++dy)
 
 1200               GGDBu[dz][dy][qx][0] = 0.0;
 
 1201               GGDBu[dz][dy][qx][1] = 0.0;
 
 1202               GGDBu[dz][dy][qx][2] = 0.0;
 
 1203               for (
int qy = 0; qy < Q1D; ++qy)
 
 1205                  const real_t by = Bt(dy,qy);
 
 1206                  const real_t gy = Gt(dy,qy);
 
 1207                  GGDBu[dz][dy][qx][0] += by * GDBu[dz][qy][qx][0];
 
 1208                  GGDBu[dz][dy][qx][1] += gy * GDBu[dz][qy][qx][1];
 
 1209                  GGDBu[dz][dy][qx][2] += by * GDBu[dz][qy][qx][2];
 
 1214      for (
int dz = 0; dz < D1D; ++dz)
 
 1216         for (
int dy = 0; dy < D1D; ++dy)
 
 1218            for (
int dx = 0; dx < D1D; ++dx)
 
 1221               for (
int qx = 0; qx < Q1D; ++qx)
 
 1223                  const real_t bx = Bt(dx,qx);
 
 1224                  const real_t gx = Gt(dx,qx);
 
 1225                  res += gx * GGDBu[dz][dy][qx][0];
 
 1226                  res += bx * GGDBu[dz][dy][qx][1];
 
 1227                  res += bx * GGDBu[dz][dy][qx][2];
 
 1229               y(dx,dy,dz,e) += res;
 
 1237template<
int T_D1D = 0, 
int T_Q1D = 0> 
static 
 1238void SmemPAConvectionApplyT3D(
const int ne,
 
 1239                              const Array<real_t> &
b,
 
 1240                              const Array<real_t> &g,
 
 1241                              const Array<real_t> &bt,
 
 1242                              const Array<real_t> >,
 
 1250   const int D1D = T_D1D ? T_D1D : d1d;
 
 1251   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 1254   auto B = 
Reshape(
b.Read(), Q1D, D1D);
 
 1255   auto Bt = 
Reshape(bt.Read(), D1D, Q1D);
 
 1256   auto Gt = 
Reshape(gt.Read(), D1D, Q1D);
 
 1257   auto op = 
Reshape(op_.Read(), Q1D, Q1D, Q1D, 3, NE);
 
 1258   auto x = 
Reshape(x_.Read(), D1D, D1D, D1D, NE);
 
 1259   auto y = 
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
 
 1262      const int D1D = T_D1D ? T_D1D : d1d;
 
 1263      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 1265      constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
 1266      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
 1267      constexpr int max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D;
 
 1268      MFEM_SHARED 
real_t sm0[3*max_DQ*max_DQ*max_DQ];
 
 1269      MFEM_SHARED 
real_t sm1[3*max_DQ*max_DQ*max_DQ];
 
 1271      real_t (*
u)[max_D1D][max_D1D] = (
real_t (*)[max_D1D][max_D1D]) sm0;
 
 1272      MFEM_FOREACH_THREAD(dz,z,D1D)
 
 1274         MFEM_FOREACH_THREAD(dy,y,D1D)
 
 1276            MFEM_FOREACH_THREAD(dx,x,D1D)
 
 1278               u[dz][dy][dx] = x(dx,dy,dz,e);
 
 1283      real_t (*Bu)[max_D1D][max_Q1D] = (
real_t (*)[max_D1D][max_Q1D])sm1;
 
 1284      MFEM_FOREACH_THREAD(dz,z,D1D)
 
 1286         MFEM_FOREACH_THREAD(dy,y,D1D)
 
 1288            MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 1291               for (
int dx = 0; dx < D1D; ++dx)
 
 1293                  const real_t bx = B(qx,dx);
 
 1294                  const real_t x = 
u[dz][dy][dx];
 
 1297               Bu[dz][dy][qx] = Bu_;
 
 1302      real_t (*BBu)[max_Q1D][max_Q1D] = (
real_t (*)[max_Q1D][max_Q1D])sm0;
 
 1303      MFEM_FOREACH_THREAD(dz,z,D1D)
 
 1305         MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 1307            MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 1310               for (
int dy = 0; dy < D1D; ++dy)
 
 1312                  const real_t bx = B(qy,dy);
 
 1313                  BBu_ += bx * Bu[dz][dy][qx];
 
 1315               BBu[dz][qy][qx] = BBu_;
 
 1320      real_t (*BBBu)[max_Q1D][max_Q1D] = (
real_t (*)[max_Q1D][max_Q1D])sm1;
 
 1321      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 1323         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 1325            MFEM_FOREACH_THREAD(qz,z,Q1D)
 
 1328               for (
int dz = 0; dz < D1D; ++dz)
 
 1330                  const real_t bx = B(qz,dz);
 
 1331                  BBBu_ += bx * BBu[dz][qy][qx];
 
 1333               BBBu[qz][qy][qx] = BBBu_;
 
 1338      real_t (*DBu)[max_Q1D][max_Q1D][3] = (
real_t (*)[max_Q1D][max_Q1D][3])sm0;
 
 1339      MFEM_FOREACH_THREAD(qz,z,Q1D)
 
 1341         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 1343            MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 1345               const real_t O1 = op(qx,qy,qz,0,e);
 
 1346               const real_t O2 = op(qx,qy,qz,1,e);
 
 1347               const real_t O3 = op(qx,qy,qz,2,e);
 
 1349               const real_t X = BBBu[qz][qy][qx];
 
 1351               DBu[qz][qy][qx][0] = O1 * X;
 
 1352               DBu[qz][qy][qx][1] = O2 * X;
 
 1353               DBu[qz][qy][qx][2] = O3 * X;
 
 1358      real_t (*GDBu)[max_Q1D][max_Q1D][3] = (
real_t (*)[max_Q1D][max_Q1D][3])sm1;
 
 1359      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 1361         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 1363            MFEM_FOREACH_THREAD(dz,z,D1D)
 
 1368               for (
int qz = 0; qz < Q1D; ++qz)
 
 1370                  const real_t bz = Bt(dz,qz);
 
 1371                  const real_t gz = Gt(dz,qz);
 
 1372                  GDBu0 += bz * DBu[qz][qy][qx][0];
 
 1373                  GDBu1 += bz * DBu[qz][qy][qx][1];
 
 1374                  GDBu2 += gz * DBu[qz][qy][qx][2];
 
 1376               GDBu[dz][qy][qx][0] = GDBu0;
 
 1377               GDBu[dz][qy][qx][1] = GDBu1;
 
 1378               GDBu[dz][qy][qx][2] = GDBu2;
 
 1383      real_t (*GGDBu)[max_D1D][max_Q1D][3] = (
real_t (*)[max_D1D][max_Q1D][3])sm0;
 
 1384      MFEM_FOREACH_THREAD(dz,z,D1D)
 
 1386         MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 1388            MFEM_FOREACH_THREAD(dy,y,D1D)
 
 1393               for (
int qy = 0; qy < Q1D; ++qy)
 
 1395                  const real_t by = Bt(dy,qy);
 
 1396                  const real_t gy = Gt(dy,qy);
 
 1397                  GGDBu0 += by * GDBu[dz][qy][qx][0];
 
 1398                  GGDBu1 += gy * GDBu[dz][qy][qx][1];
 
 1399                  GGDBu2 += by * GDBu[dz][qy][qx][2];
 
 1401               GGDBu[dz][dy][qx][0] = GGDBu0;
 
 1402               GGDBu[dz][dy][qx][1] = GGDBu1;
 
 1403               GGDBu[dz][dy][qx][2] = GGDBu2;
 
 1408      MFEM_FOREACH_THREAD(dz,z,D1D)
 
 1410         MFEM_FOREACH_THREAD(dy,y,D1D)
 
 1412            MFEM_FOREACH_THREAD(dx,x,D1D)
 
 1415               for (
int qx = 0; qx < Q1D; ++qx)
 
 1417                  const real_t bx = Bt(dx,qx);
 
 1418                  const real_t gx = Gt(dx,qx);
 
 1419                  res += gx * GGDBu[dz][dy][qx][0];
 
 1420                  res += bx * GGDBu[dz][dy][qx][1];
 
 1421                  res += bx * GGDBu[dz][dy][qx][2];
 
 1423               y(dx,dy,dz,e) += res;
 
 1430static void PAConvectionApply(
const int dim,
 
 1434                              const Array<real_t> &B,
 
 1435                              const Array<real_t> &G,
 
 1436                              const Array<real_t> &Bt,
 
 1437                              const Array<real_t> &Gt,
 
 1444      switch ((D1D << 4 ) | Q1D)
 
 1446         case 0x22: 
return SmemPAConvectionApply2D<2,2,8>(NE,B,G,Bt,Gt,op,x,y);
 
 1447         case 0x33: 
return SmemPAConvectionApply2D<3,3,4>(NE,B,G,Bt,Gt,op,x,y);
 
 1448         case 0x34: 
return SmemPAConvectionApply2D<3,4,4>(NE,B,G,Bt,Gt,op,x,y);
 
 1449         case 0x44: 
return SmemPAConvectionApply2D<4,4,4>(NE,B,G,Bt,Gt,op,x,y);
 
 1450         case 0x46: 
return SmemPAConvectionApply2D<4,6,4>(NE,B,G,Bt,Gt,op,x,y);
 
 1451         case 0x55: 
return SmemPAConvectionApply2D<5,5,2>(NE,B,G,Bt,Gt,op,x,y);
 
 1452         case 0x58: 
return SmemPAConvectionApply2D<5,8,2>(NE,B,G,Bt,Gt,op,x,y);
 
 1453         case 0x66: 
return SmemPAConvectionApply2D<6,6,1>(NE,B,G,Bt,Gt,op,x,y);
 
 1454         case 0x77: 
return SmemPAConvectionApply2D<7,7,1>(NE,B,G,Bt,Gt,op,x,y);
 
 1455         case 0x88: 
return SmemPAConvectionApply2D<8,8,1>(NE,B,G,Bt,Gt,op,x,y);
 
 1456         case 0x99: 
return SmemPAConvectionApply2D<9,9,1>(NE,B,G,Bt,Gt,op,x,y);
 
 1457         default:   
return PAConvectionApply2D(NE,B,G,Bt,Gt,op,x,y,D1D,Q1D);
 
 1462      switch ((D1D << 4 ) | Q1D)
 
 1464         case 0x22: 
return SmemPAConvectionApply3D<2,2>(NE,B,G,Bt,Gt,op,x,y);
 
 1465         case 0x23: 
return SmemPAConvectionApply3D<2,3>(NE,B,G,Bt,Gt,op,x,y);
 
 1466         case 0x24: 
return SmemPAConvectionApply3D<2,4>(NE,B,G,Bt,Gt,op,x,y);
 
 1467         case 0x26: 
return SmemPAConvectionApply3D<2,6>(NE,B,G,Bt,Gt,op,x,y);
 
 1468         case 0x34: 
return SmemPAConvectionApply3D<3,4>(NE,B,G,Bt,Gt,op,x,y);
 
 1469         case 0x35: 
return SmemPAConvectionApply3D<3,5>(NE,B,G,Bt,Gt,op,x,y);
 
 1470         case 0x45: 
return SmemPAConvectionApply3D<4,5>(NE,B,G,Bt,Gt,op,x,y);
 
 1471         case 0x48: 
return SmemPAConvectionApply3D<4,8>(NE,B,G,Bt,Gt,op,x,y);
 
 1472         case 0x56: 
return SmemPAConvectionApply3D<5,6>(NE,B,G,Bt,Gt,op,x,y);
 
 1473         case 0x67: 
return SmemPAConvectionApply3D<6,7>(NE,B,G,Bt,Gt,op,x,y);
 
 1474         case 0x78: 
return SmemPAConvectionApply3D<7,8>(NE,B,G,Bt,Gt,op,x,y);
 
 1475         case 0x89: 
return SmemPAConvectionApply3D<8,9>(NE,B,G,Bt,Gt,op,x,y);
 
 1476         default:   
return PAConvectionApply3D(NE,B,G,Bt,Gt,op,x,y,D1D,Q1D);
 
 1479   MFEM_ABORT(
"Unknown kernel.");
 
 1482static void PAConvectionApplyT(
const int dim,
 
 1486                               const Array<real_t> &B,
 
 1487                               const Array<real_t> &G,
 
 1488                               const Array<real_t> &Bt,
 
 1489                               const Array<real_t> &Gt,
 
 1496      switch ((D1D << 4 ) | Q1D)
 
 1498         case 0x22: 
return SmemPAConvectionApplyT2D<2,2,8>(NE,B,G,Bt,Gt,op,x,y);
 
 1499         case 0x33: 
return SmemPAConvectionApplyT2D<3,3,4>(NE,B,G,Bt,Gt,op,x,y);
 
 1500         case 0x34: 
return SmemPAConvectionApplyT2D<3,4,4>(NE,B,G,Bt,Gt,op,x,y);
 
 1501         case 0x44: 
return SmemPAConvectionApplyT2D<4,4,4>(NE,B,G,Bt,Gt,op,x,y);
 
 1502         case 0x46: 
return SmemPAConvectionApplyT2D<4,6,4>(NE,B,G,Bt,Gt,op,x,y);
 
 1503         case 0x55: 
return SmemPAConvectionApplyT2D<5,5,2>(NE,B,G,Bt,Gt,op,x,y);
 
 1504         case 0x58: 
return SmemPAConvectionApplyT2D<5,8,2>(NE,B,G,Bt,Gt,op,x,y);
 
 1505         case 0x66: 
return SmemPAConvectionApplyT2D<6,6,1>(NE,B,G,Bt,Gt,op,x,y);
 
 1506         case 0x77: 
return SmemPAConvectionApplyT2D<7,7,1>(NE,B,G,Bt,Gt,op,x,y);
 
 1507         case 0x88: 
return SmemPAConvectionApplyT2D<8,8,1>(NE,B,G,Bt,Gt,op,x,y);
 
 1508         case 0x99: 
return SmemPAConvectionApplyT2D<9,9,1>(NE,B,G,Bt,Gt,op,x,y);
 
 1509         default:   
return PAConvectionApplyT2D(NE,B,G,Bt,Gt,op,x,y,D1D,Q1D);
 
 1514      switch ((D1D << 4 ) | Q1D)
 
 1516         case 0x22: 
return SmemPAConvectionApplyT3D<2,2>(NE,B,G,Bt,Gt,op,x,y);
 
 1517         case 0x23: 
return SmemPAConvectionApplyT3D<2,3>(NE,B,G,Bt,Gt,op,x,y);
 
 1518         case 0x24: 
return SmemPAConvectionApplyT3D<2,4>(NE,B,G,Bt,Gt,op,x,y);
 
 1519         case 0x26: 
return SmemPAConvectionApplyT3D<2,6>(NE,B,G,Bt,Gt,op,x,y);
 
 1520         case 0x34: 
return SmemPAConvectionApplyT3D<3,4>(NE,B,G,Bt,Gt,op,x,y);
 
 1521         case 0x35: 
return SmemPAConvectionApplyT3D<3,5>(NE,B,G,Bt,Gt,op,x,y);
 
 1522         case 0x45: 
return SmemPAConvectionApplyT3D<4,5>(NE,B,G,Bt,Gt,op,x,y);
 
 1523         case 0x48: 
return SmemPAConvectionApplyT3D<4,8>(NE,B,G,Bt,Gt,op,x,y);
 
 1524         case 0x56: 
return SmemPAConvectionApplyT3D<5,6>(NE,B,G,Bt,Gt,op,x,y);
 
 1525         case 0x67: 
return SmemPAConvectionApplyT3D<6,7>(NE,B,G,Bt,Gt,op,x,y);
 
 1526         case 0x78: 
return SmemPAConvectionApplyT3D<7,8>(NE,B,G,Bt,Gt,op,x,y);
 
 1527         case 0x89: 
return SmemPAConvectionApplyT3D<8,9>(NE,B,G,Bt,Gt,op,x,y);
 
 1528         default:   
return PAConvectionApplyT3D(NE,B,G,Bt,Gt,op,x,y,D1D,Q1D);
 
 1531   MFEM_ABORT(
"Unknown kernel.");
 
 1552      MFEM_ABORT(
"AddMultPA not yet implemented with libCEED for" 
 1553                 " ConvectionIntegrator.");
 
 
const T * Read(bool on_dev=true) const
Shortcut for mfem::Read(a.GetMemory(), a.Size(), on_dev).
Class to represent a coefficient evaluated at quadrature points.
const DofToQuad * maps
Not owned.
static const IntegrationRule & GetRule(const FiniteElement &el, const ElementTransformation &Trans)
void AssembleDiagonalPA(Vector &diag) override
Assemble diagonal and add it to Vector diag.
void AssemblePA(const FiniteElementSpace &) override
Method defining partial assembly.
void AddMultPA(const Vector &, Vector &) const override
Method for partially assembled action.
const GeometricFactors * geom
Not owned.
void AddMultTransposePA(const Vector &x, Vector &y) const override
Method for partially assembled transposed action.
static MemoryType GetDeviceMemoryType()
Get the current Device MemoryType. This is the MemoryType used by most MFEM classes when allocating m...
Array< real_t > G
Gradients/divergences/curls of basis functions evaluated at quadrature points.
@ TENSOR
Tensor product representation using 1D matrices/tensors with dimensions using 1D number of quadrature...
Array< real_t > B
Basis functions evaluated at quadrature points.
int ndof
Number of degrees of freedom = number of basis functions. When mode is TENSOR, this is the 1D number.
Array< real_t > Gt
Transpose of G.
int nqpt
Number of quadrature points. When mode is TENSOR, this is the 1D number.
Array< real_t > Bt
Transpose of B.
Class FiniteElementSpace - responsible for providing FEM view of the mesh, mainly managing the set of...
bool IsVariableOrder() const
Returns true if the space contains elements of varying polynomial orders.
int GetNE() const
Returns number of elements in the mesh.
Mesh * GetMesh() const
Returns the mesh.
const FiniteElement * GetTypicalFE() const
Return GetFE(0) if the local mesh is not empty; otherwise return a typical FE based on the Geometry t...
Abstract class for all finite elements.
virtual const DofToQuad & GetDofToQuad(const IntegrationRule &ir, DofToQuad::Mode mode) const
Return a DofToQuad structure corresponding to the given IntegrationRule using the given DofToQuad::Mo...
int GetDim() const
Returns the reference space dimension for the finite element.
Vector J
Jacobians of the element transformations at all quadrature points.
Class for an integration rule - an Array of IntegrationPoint.
int GetNPoints() const
Returns the number of the points in the integration rule.
const Array< real_t > & GetWeights() const
Return the quadrature weights in a contiguous array.
const IntegrationRule * IntRule
int Dimension() const
Dimension of the reference space used within the elements.
ElementTransformation * GetTypicalElementTransformation()
If the local mesh is not empty return GetElementTransformation(0); otherwise, return the identity tra...
const GeometricFactors * GetGeometricFactors(const IntegrationRule &ir, const int flags, MemoryType d_mt=MemoryType::DEFAULT)
Return the mesh geometric factors corresponding to the given integration rule.
int GetNumGeometries(int dim) const
Return the number of geometries of the given dimension present in the mesh.
Class representing the storage layout of a QuadratureFunction.
virtual const real_t * Read(bool on_dev=true) const
Shortcut for mfem::Read(vec.GetMemory(), vec.Size(), on_dev).
virtual real_t * ReadWrite(bool on_dev=true)
Shortcut for mfem::ReadWrite(vec.GetMemory(), vec.Size(), on_dev).
void SetSize(int s)
Resize the vector to size s.
void GetDiagonal(mfem::Vector &diag) const
void AddMult(const mfem::Vector &x, mfem::Vector &y, const real_t a=1.0) const override
Operator application: y+=A(x) (default) or y+=a*A(x).
Represent a ConvectionIntegrator with AssemblyLevel::Partial using libCEED.
const T * Read(const Memory< T > &mem, int size, bool on_dev=true)
Get a pointer for read access to mem with the mfem::Device's DeviceMemoryClass, if on_dev = true,...
real_t u(const Vector &xvec)
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
bool DeviceCanUseCeed()
Function that determines if a CEED kernel should be used, based on the current mfem::Device configura...
@ COMPRESSED
Enable all above compressions.
void forall_3D(int N, int X, int Y, int Z, lambda &&body)
MemoryType
Memory types supported by MFEM.
void forall(int N, lambda &&body)
void vel(const Vector &x, real_t t, Vector &u)
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.