12#ifndef MFEM_BILININTEG_HCURL_KERNELS_HPP 
   13#define MFEM_BILININTEG_HCURL_KERNELS_HPP 
   32void PAHcurlMassAssembleDiagonal2D(
const int D1D,
 
   36                                   const Array<real_t> &bo,
 
   37                                   const Array<real_t> &bc,
 
   38                                   const Vector &pa_data,
 
   42void PAHcurlMassAssembleDiagonal3D(
const int D1D,
 
   46                                   const Array<real_t> &bo,
 
   47                                   const Array<real_t> &bc,
 
   48                                   const Vector &pa_data,
 
   52template<
int T_D1D = 0, 
int T_Q1D = 0>
 
   53inline void SmemPAHcurlMassAssembleDiagonal3D(
const int d1d,
 
   57                                              const Array<real_t> &bo,
 
   58                                              const Array<real_t> &bc,
 
   59                                              const Vector &pa_data,
 
   63               "Error: d1d > HCURL_MAX_D1D");
 
   65               "Error: q1d > HCURL_MAX_Q1D");
 
   66   const int D1D = T_D1D ? T_D1D : d1d;
 
   67   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
   69   auto Bo = 
Reshape(bo.Read(), Q1D, D1D-1);
 
   70   auto Bc = 
Reshape(bc.Read(), Q1D, D1D);
 
   71   auto op = 
Reshape(pa_data.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
 
   72   auto D = 
Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
 
   76      constexpr int VDIM = 3;
 
   77      constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
 
   78      constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
 
   79      const int D1D = T_D1D ? T_D1D : d1d;
 
   80      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
   82      MFEM_SHARED 
real_t sBo[MQ1D][MD1D];
 
   83      MFEM_SHARED 
real_t sBc[MQ1D][MD1D];
 
   86      MFEM_SHARED 
real_t sop[3][MQ1D][MQ1D];
 
   88      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
   90         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
   92            MFEM_FOREACH_THREAD(qz,z,Q1D)
 
   94               op3[0] = op(qx,qy,qz,0,e);
 
   95               op3[1] = op(qx,qy,qz,symmetric ? 3 : 4,e);
 
   96               op3[2] = op(qx,qy,qz,symmetric ? 5 : 8,e);
 
  101      const int tidx = MFEM_THREAD_ID(x);
 
  102      const int tidy = MFEM_THREAD_ID(y);
 
  103      const int tidz = MFEM_THREAD_ID(z);
 
  107         MFEM_FOREACH_THREAD(d,y,D1D)
 
  109            MFEM_FOREACH_THREAD(q,x,Q1D)
 
  122      for (
int c = 0; c < VDIM; ++c)  
 
  124         const int D1Dz = (c == 2) ? D1D - 1 : D1D;
 
  125         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
 
  126         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
 
  130         for (
int qz=0; qz < Q1D; ++qz)
 
  134               for (
int i=0; i<3; ++i)
 
  136                  sop[i][tidx][tidy] = op3[i];
 
  142            MFEM_FOREACH_THREAD(dz,z,D1Dz)
 
  144               const real_t wz = ((c == 2) ? sBo[qz][dz] : sBc[qz][dz]);
 
  146               MFEM_FOREACH_THREAD(dy,y,D1Dy)
 
  148                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
 
  150                     for (
int qy = 0; qy < Q1D; ++qy)
 
  152                        const real_t wy = ((c == 1) ? sBo[qy][dy] : sBc[qy][dy]);
 
  154                        for (
int qx = 0; qx < Q1D; ++qx)
 
  156                           const real_t wx = ((c == 0) ? sBo[qx][dx] : sBc[qx][dx]);
 
  157                           dxyz += sop[c][qx][qy] * wx * wx * wy * wy * wz * wz;
 
  167         MFEM_FOREACH_THREAD(dz,z,D1Dz)
 
  169            MFEM_FOREACH_THREAD(dy,y,D1Dy)
 
  171               MFEM_FOREACH_THREAD(dx,x,D1Dx)
 
  173                  D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += dxyz;
 
  178         osc += D1Dx * D1Dy * D1Dz;
 
  184void PAHcurlMassApply2D(
const int D1D,
 
  187                        const bool symmetric,
 
  188                        const Array<real_t> &bo,
 
  189                        const Array<real_t> &bc,
 
  190                        const Array<real_t> &bot,
 
  191                        const Array<real_t> &bct,
 
  192                        const Vector &pa_data,
 
  197void PAHcurlMassApply3D(
const int D1D,
 
  200                        const bool symmetric,
 
  201                        const Array<real_t> &bo,
 
  202                        const Array<real_t> &bc,
 
  203                        const Array<real_t> &bot,
 
  204                        const Array<real_t> &bct,
 
  205                        const Vector &pa_data,
 
  210template<
int T_D1D = 0, 
int T_Q1D = 0>
 
  211inline void SmemPAHcurlMassApply3D(
const int d1d,
 
  214                                   const bool symmetric,
 
  215                                   const Array<real_t> &bo,
 
  216                                   const Array<real_t> &bc,
 
  217                                   const Array<real_t> &bot,
 
  218                                   const Array<real_t> &bct,
 
  219                                   const Vector &pa_data,
 
  224               "Error: d1d > HCURL_MAX_D1D");
 
  226               "Error: q1d > HCURL_MAX_Q1D");
 
  227   const int D1D = T_D1D ? T_D1D : d1d;
 
  228   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  230   const int dataSize = symmetric ? 6 : 9;
 
  232   auto Bo = 
Reshape(bo.Read(), Q1D, D1D-1);
 
  233   auto Bc = 
Reshape(bc.Read(), Q1D, D1D);
 
  234   auto op = 
Reshape(pa_data.Read(), Q1D, Q1D, Q1D, dataSize, NE);
 
  235   auto X = 
Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
 
  236   auto Y = 
Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
 
  240      constexpr int VDIM = 3;
 
  241      constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
 
  242      constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
 
  243      const int D1D = T_D1D ? T_D1D : d1d;
 
  244      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  246      MFEM_SHARED 
real_t sBo[MQ1D][MD1D];
 
  247      MFEM_SHARED 
real_t sBc[MQ1D][MD1D];
 
  250      MFEM_SHARED 
real_t sop[9*MQ1D*MQ1D];
 
  251      MFEM_SHARED 
real_t mass[MQ1D][MQ1D][3];
 
  253      MFEM_SHARED 
real_t sX[MD1D][MD1D][MD1D];
 
  255      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  257         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  259            MFEM_FOREACH_THREAD(qz,z,Q1D)
 
  261               for (
int i=0; i<dataSize; ++i)
 
  263                  op9[i] = op(qx,qy,qz,i,e);
 
  269      const int tidx = MFEM_THREAD_ID(x);
 
  270      const int tidy = MFEM_THREAD_ID(y);
 
  271      const int tidz = MFEM_THREAD_ID(z);
 
  275         MFEM_FOREACH_THREAD(d,y,D1D)
 
  277            MFEM_FOREACH_THREAD(q,x,Q1D)
 
  289      for (
int qz=0; qz < Q1D; ++qz)
 
  292         for (
int c = 0; c < VDIM; ++c)  
 
  294            const int D1Dz = (c == 2) ? D1D - 1 : D1D;
 
  295            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
 
  296            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
 
  298            MFEM_FOREACH_THREAD(dz,z,D1Dz)
 
  300               MFEM_FOREACH_THREAD(dy,y,D1Dy)
 
  302                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
 
  304                     sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
 
  312               for (
int i=0; i<dataSize; ++i)
 
  314                  sop[i + (dataSize*tidx) + (dataSize*Q1D*tidy)] = op9[i];
 
  317               MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  319                  MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  323                     for (
int dz = 0; dz < D1Dz; ++dz)
 
  325                        const real_t wz = (c == 2) ? sBo[qz][dz] : sBc[qz][dz];
 
  326                        for (
int dy = 0; dy < D1Dy; ++dy)
 
  328                           const real_t wy = (c == 1) ? sBo[qy][dy] : sBc[qy][dy];
 
  329                           for (
int dx = 0; dx < D1Dx; ++dx)
 
  331                              const real_t t = sX[dz][dy][dx];
 
  332                              const real_t wx = (c == 0) ? sBo[qx][dx] : sBc[qx][dx];
 
  333                              u += t * wx * wy * wz;
 
  343            osc += D1Dx * D1Dy * D1Dz;
 
  350         for (
int c = 0; c < VDIM; ++c)  
 
  352            const int D1Dz = (c == 2) ? D1D - 1 : D1D;
 
  353            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
 
  354            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
 
  358            MFEM_FOREACH_THREAD(dz,z,D1Dz)
 
  360               const real_t wz = (c == 2) ? sBo[qz][dz] : sBc[qz][dz];
 
  362               MFEM_FOREACH_THREAD(dy,y,D1Dy)
 
  364                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
 
  366                     for (
int qy = 0; qy < Q1D; ++qy)
 
  368                        const real_t wy = (c == 1) ? sBo[qy][dy] : sBc[qy][dy];
 
  369                        for (
int qx = 0; qx < Q1D; ++qx)
 
  371                           const int os = (dataSize*qx) + (dataSize*Q1D*qy);
 
  372                           const int id1 = os + ((c == 0) ? 0 : ((c == 1) ? (symmetric ? 1 : 3) :
 
  373                                                                 (symmetric ? 2 : 6))); 
 
  374                           const int id2 = os + ((c == 0) ? 1 : ((c == 1) ? (symmetric ? 3 : 4) :
 
  375                                                                 (symmetric ? 4 : 7))); 
 
  376                           const int id3 = os + ((c == 0) ? 2 : ((c == 1) ? (symmetric ? 4 : 5) :
 
  377                                                                 (symmetric ? 5 : 8))); 
 
  379                           const real_t m_c = (sop[id1] * mass[qy][qx][0]) + (sop[id2] * mass[qy][qx][1]) +
 
  380                                              (sop[id3] * mass[qy][qx][2]);
 
  382                           const real_t wx = (c == 0) ? sBo[qx][dx] : sBc[qx][dx];
 
  383                           dxyz += m_c * wx * wy * wz;
 
  392            MFEM_FOREACH_THREAD(dz,z,D1Dz)
 
  394               MFEM_FOREACH_THREAD(dy,y,D1Dy)
 
  396                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
 
  398                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += dxyz;
 
  403            osc += D1Dx * D1Dy * D1Dz;
 
  410void PACurlCurlSetup2D(
const int Q1D,
 
  412                       const Array<real_t> &w,
 
  418void PACurlCurlSetup3D(
const int Q1D,
 
  421                       const Array<real_t> &w,
 
  427void PACurlCurlAssembleDiagonal2D(
const int D1D,
 
  430                                  const Array<real_t> &bo,
 
  431                                  const Array<real_t> &gc,
 
  432                                  const Vector &pa_data,
 
  436template<
int T_D1D = 0, 
int T_Q1D = 0>
 
  437inline void PACurlCurlAssembleDiagonal3D(
const int d1d,
 
  439                                         const bool symmetric,
 
  441                                         const Array<real_t> &bo,
 
  442                                         const Array<real_t> &bc,
 
  443                                         const Array<real_t> &go,
 
  444                                         const Array<real_t> &gc,
 
  445                                         const Vector &pa_data,
 
  449               "Error: d1d > HCURL_MAX_D1D");
 
  451               "Error: q1d > HCURL_MAX_Q1D");
 
  452   const int D1D = T_D1D ? T_D1D : d1d;
 
  453   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  455   auto Bo = 
Reshape(bo.Read(), Q1D, D1D-1);
 
  456   auto Bc = 
Reshape(bc.Read(), Q1D, D1D);
 
  457   auto Go = 
Reshape(go.Read(), Q1D, D1D-1);
 
  458   auto Gc = 
Reshape(gc.Read(), Q1D, D1D);
 
  459   auto op = 
Reshape(pa_data.Read(), Q1D, Q1D, Q1D, (symmetric ? 6 : 9), NE);
 
  460   auto D = 
Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
 
  462   const int s = symmetric ? 6 : 9;
 
  466   const int i21 = symmetric ? i12 : 3;
 
  467   const int i22 = symmetric ? 3 : 4;
 
  468   const int i23 = symmetric ? 4 : 5;
 
  469   const int i31 = symmetric ? i13 : 6;
 
  470   const int i32 = symmetric ? i23 : 7;
 
  471   const int i33 = symmetric ? 5 : 8;
 
  484      constexpr int VDIM = 3;
 
  485      constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
 
  486      constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
 
  487      const int D1D = T_D1D ? T_D1D : d1d;
 
  488      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  492      for (
int c = 0; c < VDIM; ++c)  
 
  494         const int D1Dz = (c == 2) ? D1D - 1 : D1D;
 
  495         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
 
  496         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
 
  498         real_t zt[MQ1D][MQ1D][MD1D][9][3];
 
  501         for (
int qx = 0; qx < Q1D; ++qx)
 
  503            for (
int qy = 0; qy < Q1D; ++qy)
 
  505               for (
int dz = 0; dz < D1Dz; ++dz)
 
  507                  for (
int i=0; i<s; ++i)
 
  509                     for (
int d=0; d<3; ++d)
 
  511                        zt[qx][qy][dz][i][d] = 0.0;
 
  515                  for (
int qz = 0; qz < Q1D; ++qz)
 
  517                     const real_t wz = ((c == 2) ? Bo(qz,dz) : Bc(qz,dz));
 
  518                     const real_t wDz = ((c == 2) ? Go(qz,dz) : Gc(qz,dz));
 
  520                     for (
int i=0; i<s; ++i)
 
  522                        zt[qx][qy][dz][i][0] += wz * wz * op(qx,qy,qz,i,e);
 
  523                        zt[qx][qy][dz][i][1] += wDz * wz * op(qx,qy,qz,i,e);
 
  524                        zt[qx][qy][dz][i][2] += wDz * wDz * op(qx,qy,qz,i,e);
 
  531         real_t yt[MQ1D][MD1D][MD1D][9][3][3];
 
  534         for (
int qx = 0; qx < Q1D; ++qx)
 
  536            for (
int dz = 0; dz < D1Dz; ++dz)
 
  538               for (
int dy = 0; dy < D1Dy; ++dy)
 
  540                  for (
int i=0; i<s; ++i)
 
  542                     for (
int d=0; d<3; ++d)
 
  543                        for (
int j=0; j<3; ++j)
 
  545                           yt[qx][dy][dz][i][d][j] = 0.0;
 
  549                  for (
int qy = 0; qy < Q1D; ++qy)
 
  551                     const real_t wy = ((c == 1) ? Bo(qy,dy) : Bc(qy,dy));
 
  552                     const real_t wDy = ((c == 1) ? Go(qy,dy) : Gc(qy,dy));
 
  554                     for (
int i=0; i<s; ++i)
 
  556                        for (
int d=0; d<3; ++d)
 
  558                           yt[qx][dy][dz][i][d][0] += wy * wy * zt[qx][qy][dz][i][d];
 
  559                           yt[qx][dy][dz][i][d][1] += wDy * wy * zt[qx][qy][dz][i][d];
 
  560                           yt[qx][dy][dz][i][d][2] += wDy * wDy * zt[qx][qy][dz][i][d];
 
  569         for (
int dz = 0; dz < D1Dz; ++dz)
 
  571            for (
int dy = 0; dy < D1Dy; ++dy)
 
  573               for (
int dx = 0; dx < D1Dx; ++dx)
 
  575                  for (
int qx = 0; qx < Q1D; ++qx)
 
  577                     const real_t wx = ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
 
  578                     const real_t wDx = ((c == 0) ? Go(qx,dx) : Gc(qx,dx));
 
  598                        const real_t sumy = yt[qx][dy][dz][i22][2][0] - yt[qx][dy][dz][i23][1][1]
 
  599                                            - yt[qx][dy][dz][i32][1][1] + yt[qx][dy][dz][i33][0][2];
 
  601                        D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += sumy * wx * wx;
 
  606                        const real_t d = (yt[qx][dy][dz][i11][2][0] * wx * wx)
 
  607                                         - ((yt[qx][dy][dz][i13][1][0] + yt[qx][dy][dz][i31][1][0]) * wDx * wx)
 
  608                                         + (yt[qx][dy][dz][i33][0][0] * wDx * wDx);
 
  610                        D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += d;
 
  615                        const real_t d = (yt[qx][dy][dz][i11][0][2] * wx * wx)
 
  616                                         - ((yt[qx][dy][dz][i12][0][1] + yt[qx][dy][dz][i21][0][1]) * wDx * wx)
 
  617                                         + (yt[qx][dy][dz][i22][0][0] * wDx * wDx);
 
  619                        D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += d;
 
  626         osc += D1Dx * D1Dy * D1Dz;
 
  632template<
int T_D1D = 0, 
int T_Q1D = 0>
 
  633inline void SmemPACurlCurlAssembleDiagonal3D(
const int d1d,
 
  635                                             const bool symmetric,
 
  637                                             const Array<real_t> &bo,
 
  638                                             const Array<real_t> &bc,
 
  639                                             const Array<real_t> &go,
 
  640                                             const Array<real_t> &gc,
 
  641                                             const Vector &pa_data,
 
  645               "Error: d1d > HCURL_MAX_D1D");
 
  647               "Error: q1d > HCURL_MAX_Q1D");
 
  648   const int D1D = T_D1D ? T_D1D : d1d;
 
  649   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  651   auto Bo = 
Reshape(bo.Read(), Q1D, D1D-1);
 
  652   auto Bc = 
Reshape(bc.Read(), Q1D, D1D);
 
  653   auto Go = 
Reshape(go.Read(), Q1D, D1D-1);
 
  654   auto Gc = 
Reshape(gc.Read(), Q1D, D1D);
 
  655   auto op = 
Reshape(pa_data.Read(), Q1D, Q1D, Q1D, (symmetric ? 6 : 9), NE);
 
  656   auto D = 
Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
 
  658   const int s = symmetric ? 6 : 9;
 
  662   const int i21 = symmetric ? i12 : 3;
 
  663   const int i22 = symmetric ? 3 : 4;
 
  664   const int i23 = symmetric ? 4 : 5;
 
  665   const int i31 = symmetric ? i13 : 6;
 
  666   const int i32 = symmetric ? i23 : 7;
 
  667   const int i33 = symmetric ? 5 : 8;
 
  677      constexpr int VDIM = 3;
 
  678      constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
 
  679      constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
 
  680      const int D1D = T_D1D ? T_D1D : d1d;
 
  681      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  683      MFEM_SHARED 
real_t sBo[MQ1D][MD1D];
 
  684      MFEM_SHARED 
real_t sBc[MQ1D][MD1D];
 
  685      MFEM_SHARED 
real_t sGo[MQ1D][MD1D];
 
  686      MFEM_SHARED 
real_t sGc[MQ1D][MD1D];
 
  689      MFEM_SHARED 
real_t sop[9][MQ1D][MQ1D];
 
  691      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  693         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  695            MFEM_FOREACH_THREAD(qz,z,Q1D)
 
  697               for (
int i=0; i<s; ++i)
 
  699                  ope[i] = op(qx,qy,qz,i,e);
 
  705      const int tidx = MFEM_THREAD_ID(x);
 
  706      const int tidy = MFEM_THREAD_ID(y);
 
  707      const int tidz = MFEM_THREAD_ID(z);
 
  711         MFEM_FOREACH_THREAD(d,y,D1D)
 
  713            MFEM_FOREACH_THREAD(q,x,Q1D)
 
  728      for (
int c = 0; c < VDIM; ++c)  
 
  730         const int D1Dz = (c == 2) ? D1D - 1 : D1D;
 
  731         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
 
  732         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
 
  736         for (
int qz=0; qz < Q1D; ++qz)
 
  740               for (
int i=0; i<s; ++i)
 
  742                  sop[i][tidx][tidy] = ope[i];
 
  748            MFEM_FOREACH_THREAD(dz,z,D1Dz)
 
  750               const real_t wz = ((c == 2) ? sBo[qz][dz] : sBc[qz][dz]);
 
  751               const real_t wDz = ((c == 2) ? sGo[qz][dz] : sGc[qz][dz]);
 
  753               MFEM_FOREACH_THREAD(dy,y,D1Dy)
 
  755                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
 
  757                     for (
int qy = 0; qy < Q1D; ++qy)
 
  759                        const real_t wy = ((c == 1) ? sBo[qy][dy] : sBc[qy][dy]);
 
  760                        const real_t wDy = ((c == 1) ? sGo[qy][dy] : sGc[qy][dy]);
 
  762                        for (
int qx = 0; qx < Q1D; ++qx)
 
  764                           const real_t wx = ((c == 0) ? sBo[qx][dx] : sBc[qx][dx]);
 
  765                           const real_t wDx = ((c == 0) ? sGo[qx][dx] : sGc[qx][dx]);
 
  772                              dxyz += sop[i22][qx][qy] * wx * wx * wy * wy * wDz * wDz;
 
  775                              dxyz += -(sop[i23][qx][qy] + sop[i32][qx][qy]) * wx * wx * wDy * wy * wDz * wz;
 
  778                              dxyz += sop[i33][qx][qy] * wx * wx * wDy * wDy * wz * wz;
 
  785                              dxyz += sop[i11][qx][qy] * wx * wx * wy * wy * wDz * wDz;
 
  788                              dxyz += -(sop[i13][qx][qy] + sop[i31][qx][qy]) * wDx * wx * wy * wy * wDz * wz;
 
  791                              dxyz += sop[i33][qx][qy] * wDx * wDx * wy * wy * wz * wz;
 
  798                              dxyz += sop[i11][qx][qy] * wx * wx * wDy * wDy * wz * wz;
 
  801                              dxyz += -(sop[i12][qx][qy] + sop[i21][qx][qy]) * wDx * wx * wDy * wy * wz * wz;
 
  804                              dxyz += sop[i22][qx][qy] * wDx * wDx * wy * wy * wz * wz;
 
  815         MFEM_FOREACH_THREAD(dz,z,D1Dz)
 
  817            MFEM_FOREACH_THREAD(dy,y,D1Dy)
 
  819               MFEM_FOREACH_THREAD(dx,x,D1Dx)
 
  821                  D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += dxyz;
 
  826         osc += D1Dx * D1Dy * D1Dz;
 
  832void PACurlCurlApply2D(
const int D1D,
 
  835                       const Array<real_t> &bo,
 
  836                       const Array<real_t> &bot,
 
  837                       const Array<real_t> &gc,
 
  838                       const Array<real_t> &gct,
 
  839                       const Vector &pa_data,
 
  844template<
int T_D1D = 0, 
int T_Q1D = 0>
 
  845inline void PACurlCurlApply3D(
const int d1d,
 
  847                              const bool symmetric,
 
  849                              const Array<real_t> &bo,
 
  850                              const Array<real_t> &bc,
 
  851                              const Array<real_t> &bot,
 
  852                              const Array<real_t> &bct,
 
  853                              const Array<real_t> &gc,
 
  854                              const Array<real_t> &gct,
 
  855                              const Vector &pa_data,
 
  860               "Error: d1d > HCURL_MAX_D1D");
 
  862               "Error: q1d > HCURL_MAX_Q1D");
 
  863   const int D1D = T_D1D ? T_D1D : d1d;
 
  864   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  866   auto Bo = 
Reshape(bo.Read(), Q1D, D1D-1);
 
  867   auto Bc = 
Reshape(bc.Read(), Q1D, D1D);
 
  868   auto Bot = 
Reshape(bot.Read(), D1D-1, Q1D);
 
  869   auto Bct = 
Reshape(bct.Read(), D1D, Q1D);
 
  870   auto Gc = 
Reshape(gc.Read(), Q1D, D1D);
 
  871   auto Gct = 
Reshape(gct.Read(), D1D, Q1D);
 
  872   auto op = 
Reshape(pa_data.Read(), Q1D, Q1D, Q1D, (symmetric ? 6 : 9), NE);
 
  873   auto X = 
Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
 
  874   auto Y = 
Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
 
  886      constexpr int VDIM = 3;
 
  887      constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
 
  888      constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
 
  889      const int D1D = T_D1D ? T_D1D : d1d;
 
  890      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  892      real_t curl[MQ1D][MQ1D][MQ1D][VDIM];
 
  895      for (
int qz = 0; qz < Q1D; ++qz)
 
  897         for (
int qy = 0; qy < Q1D; ++qy)
 
  899            for (
int qx = 0; qx < Q1D; ++qx)
 
  901               for (
int c = 0; c < VDIM; ++c)
 
  903                  curl[qz][qy][qx][c] = 0.0;
 
  915         const int D1Dz = D1D;
 
  916         const int D1Dy = D1D;
 
  917         const int D1Dx = D1D - 1;
 
  919         for (
int dz = 0; dz < D1Dz; ++dz)
 
  921            real_t gradXY[MQ1D][MQ1D][2];
 
  922            for (
int qy = 0; qy < Q1D; ++qy)
 
  924               for (
int qx = 0; qx < Q1D; ++qx)
 
  926                  for (
int d = 0; d < 2; ++d)
 
  928                     gradXY[qy][qx][d] = 0.0;
 
  933            for (
int dy = 0; dy < D1Dy; ++dy)
 
  936               for (
int qx = 0; qx < Q1D; ++qx)
 
  941               for (
int dx = 0; dx < D1Dx; ++dx)
 
  943                  const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
 
  944                  for (
int qx = 0; qx < Q1D; ++qx)
 
  946                     massX[qx] += t * Bo(qx,dx);
 
  950               for (
int qy = 0; qy < Q1D; ++qy)
 
  952                  const real_t wy = Bc(qy,dy);
 
  953                  const real_t wDy = Gc(qy,dy);
 
  954                  for (
int qx = 0; qx < Q1D; ++qx)
 
  956                     const real_t wx = massX[qx];
 
  957                     gradXY[qy][qx][0] += wx * wDy;
 
  958                     gradXY[qy][qx][1] += wx * wy;
 
  963            for (
int qz = 0; qz < Q1D; ++qz)
 
  965               const real_t wz = Bc(qz,dz);
 
  966               const real_t wDz = Gc(qz,dz);
 
  967               for (
int qy = 0; qy < Q1D; ++qy)
 
  969                  for (
int qx = 0; qx < Q1D; ++qx)
 
  972                     curl[qz][qy][qx][1] += gradXY[qy][qx][1] * wDz; 
 
  973                     curl[qz][qy][qx][2] -= gradXY[qy][qx][0] * wz;  
 
  979         osc += D1Dx * D1Dy * D1Dz;
 
  984         const int D1Dz = D1D;
 
  985         const int D1Dy = D1D - 1;
 
  986         const int D1Dx = D1D;
 
  988         for (
int dz = 0; dz < D1Dz; ++dz)
 
  990            real_t gradXY[MQ1D][MQ1D][2];
 
  991            for (
int qy = 0; qy < Q1D; ++qy)
 
  993               for (
int qx = 0; qx < Q1D; ++qx)
 
  995                  for (
int d = 0; d < 2; ++d)
 
  997                     gradXY[qy][qx][d] = 0.0;
 
 1002            for (
int dx = 0; dx < D1Dx; ++dx)
 
 1005               for (
int qy = 0; qy < Q1D; ++qy)
 
 1010               for (
int dy = 0; dy < D1Dy; ++dy)
 
 1012                  const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
 
 1013                  for (
int qy = 0; qy < Q1D; ++qy)
 
 1015                     massY[qy] += t * Bo(qy,dy);
 
 1019               for (
int qx = 0; qx < Q1D; ++qx)
 
 1021                  const real_t wx = Bc(qx,dx);
 
 1022                  const real_t wDx = Gc(qx,dx);
 
 1023                  for (
int qy = 0; qy < Q1D; ++qy)
 
 1025                     const real_t wy = massY[qy];
 
 1026                     gradXY[qy][qx][0] += wDx * wy;
 
 1027                     gradXY[qy][qx][1] += wx * wy;
 
 1032            for (
int qz = 0; qz < Q1D; ++qz)
 
 1034               const real_t wz = Bc(qz,dz);
 
 1035               const real_t wDz = Gc(qz,dz);
 
 1036               for (
int qy = 0; qy < Q1D; ++qy)
 
 1038                  for (
int qx = 0; qx < Q1D; ++qx)
 
 1041                     curl[qz][qy][qx][0] -= gradXY[qy][qx][1] * wDz; 
 
 1042                     curl[qz][qy][qx][2] += gradXY[qy][qx][0] * wz;  
 
 1048         osc += D1Dx * D1Dy * D1Dz;
 
 1053         const int D1Dz = D1D - 1;
 
 1054         const int D1Dy = D1D;
 
 1055         const int D1Dx = D1D;
 
 1057         for (
int dx = 0; dx < D1Dx; ++dx)
 
 1059            real_t gradYZ[MQ1D][MQ1D][2];
 
 1060            for (
int qz = 0; qz < Q1D; ++qz)
 
 1062               for (
int qy = 0; qy < Q1D; ++qy)
 
 1064                  for (
int d = 0; d < 2; ++d)
 
 1066                     gradYZ[qz][qy][d] = 0.0;
 
 1071            for (
int dy = 0; dy < D1Dy; ++dy)
 
 1074               for (
int qz = 0; qz < Q1D; ++qz)
 
 1079               for (
int dz = 0; dz < D1Dz; ++dz)
 
 1081                  const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
 
 1082                  for (
int qz = 0; qz < Q1D; ++qz)
 
 1084                     massZ[qz] += t * Bo(qz,dz);
 
 1088               for (
int qy = 0; qy < Q1D; ++qy)
 
 1090                  const real_t wy = Bc(qy,dy);
 
 1091                  const real_t wDy = Gc(qy,dy);
 
 1092                  for (
int qz = 0; qz < Q1D; ++qz)
 
 1094                     const real_t wz = massZ[qz];
 
 1095                     gradYZ[qz][qy][0] += wz * wy;
 
 1096                     gradYZ[qz][qy][1] += wz * wDy;
 
 1101            for (
int qx = 0; qx < Q1D; ++qx)
 
 1103               const real_t wx = Bc(qx,dx);
 
 1104               const real_t wDx = Gc(qx,dx);
 
 1106               for (
int qy = 0; qy < Q1D; ++qy)
 
 1108                  for (
int qz = 0; qz < Q1D; ++qz)
 
 1111                     curl[qz][qy][qx][0] += gradYZ[qz][qy][1] * wx;  
 
 1112                     curl[qz][qy][qx][1] -= gradYZ[qz][qy][0] * wDx; 
 
 1120      for (
int qz = 0; qz < Q1D; ++qz)
 
 1122         for (
int qy = 0; qy < Q1D; ++qy)
 
 1124            for (
int qx = 0; qx < Q1D; ++qx)
 
 1126               const real_t O11 = op(qx,qy,qz,0,e);
 
 1127               const real_t O12 = op(qx,qy,qz,1,e);
 
 1128               const real_t O13 = op(qx,qy,qz,2,e);
 
 1129               const real_t O21 = symmetric ? O12 : op(qx,qy,qz,3,e);
 
 1130               const real_t O22 = symmetric ? op(qx,qy,qz,3,e) : op(qx,qy,qz,4,e);
 
 1131               const real_t O23 = symmetric ? op(qx,qy,qz,4,e) : op(qx,qy,qz,5,e);
 
 1132               const real_t O31 = symmetric ? O13 : op(qx,qy,qz,6,e);
 
 1133               const real_t O32 = symmetric ? O23 : op(qx,qy,qz,7,e);
 
 1134               const real_t O33 = symmetric ? op(qx,qy,qz,5,e) : op(qx,qy,qz,8,e);
 
 1136               const real_t c1 = (O11 * curl[qz][qy][qx][0]) + (O12 * curl[qz][qy][qx][1]) +
 
 1137                                 (O13 * curl[qz][qy][qx][2]);
 
 1138               const real_t c2 = (O21 * curl[qz][qy][qx][0]) + (O22 * curl[qz][qy][qx][1]) +
 
 1139                                 (O23 * curl[qz][qy][qx][2]);
 
 1140               const real_t c3 = (O31 * curl[qz][qy][qx][0]) + (O32 * curl[qz][qy][qx][1]) +
 
 1141                                 (O33 * curl[qz][qy][qx][2]);
 
 1143               curl[qz][qy][qx][0] = c1;
 
 1144               curl[qz][qy][qx][1] = c2;
 
 1145               curl[qz][qy][qx][2] = c3;
 
 1153         const int D1Dz = D1D;
 
 1154         const int D1Dy = D1D;
 
 1155         const int D1Dx = D1D - 1;
 
 1157         for (
int qz = 0; qz < Q1D; ++qz)
 
 1159            real_t gradXY12[MD1D][MD1D];
 
 1160            real_t gradXY21[MD1D][MD1D];
 
 1162            for (
int dy = 0; dy < D1Dy; ++dy)
 
 1164               for (
int dx = 0; dx < D1Dx; ++dx)
 
 1166                  gradXY12[dy][dx] = 0.0;
 
 1167                  gradXY21[dy][dx] = 0.0;
 
 1170            for (
int qy = 0; qy < Q1D; ++qy)
 
 1173               for (
int dx = 0; dx < D1Dx; ++dx)
 
 1175                  for (
int n = 0; n < 2; ++n)
 
 1180               for (
int qx = 0; qx < Q1D; ++qx)
 
 1182                  for (
int dx = 0; dx < D1Dx; ++dx)
 
 1184                     const real_t wx = Bot(dx,qx);
 
 1186                     massX[dx][0] += wx * curl[qz][qy][qx][1];
 
 1187                     massX[dx][1] += wx * curl[qz][qy][qx][2];
 
 1190               for (
int dy = 0; dy < D1Dy; ++dy)
 
 1192                  const real_t wy = Bct(dy,qy);
 
 1193                  const real_t wDy = Gct(dy,qy);
 
 1195                  for (
int dx = 0; dx < D1Dx; ++dx)
 
 1197                     gradXY21[dy][dx] += massX[dx][0] * wy;
 
 1198                     gradXY12[dy][dx] += massX[dx][1] * wDy;
 
 1203            for (
int dz = 0; dz < D1Dz; ++dz)
 
 1205               const real_t wz = Bct(dz,qz);
 
 1206               const real_t wDz = Gct(dz,qz);
 
 1207               for (
int dy = 0; dy < D1Dy; ++dy)
 
 1209                  for (
int dx = 0; dx < D1Dx; ++dx)
 
 1213                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
 
 1214                       e) += (gradXY21[dy][dx] * wDz) - (gradXY12[dy][dx] * wz);
 
 1220         osc += D1Dx * D1Dy * D1Dz;
 
 1225         const int D1Dz = D1D;
 
 1226         const int D1Dy = D1D - 1;
 
 1227         const int D1Dx = D1D;
 
 1229         for (
int qz = 0; qz < Q1D; ++qz)
 
 1231            real_t gradXY02[MD1D][MD1D];
 
 1232            real_t gradXY20[MD1D][MD1D];
 
 1234            for (
int dy = 0; dy < D1Dy; ++dy)
 
 1236               for (
int dx = 0; dx < D1Dx; ++dx)
 
 1238                  gradXY02[dy][dx] = 0.0;
 
 1239                  gradXY20[dy][dx] = 0.0;
 
 1242            for (
int qx = 0; qx < Q1D; ++qx)
 
 1245               for (
int dy = 0; dy < D1Dy; ++dy)
 
 1250               for (
int qy = 0; qy < Q1D; ++qy)
 
 1252                  for (
int dy = 0; dy < D1Dy; ++dy)
 
 1254                     const real_t wy = Bot(dy,qy);
 
 1256                     massY[dy][0] += wy * curl[qz][qy][qx][2];
 
 1257                     massY[dy][1] += wy * curl[qz][qy][qx][0];
 
 1260               for (
int dx = 0; dx < D1Dx; ++dx)
 
 1262                  const real_t wx = Bct(dx,qx);
 
 1263                  const real_t wDx = Gct(dx,qx);
 
 1265                  for (
int dy = 0; dy < D1Dy; ++dy)
 
 1267                     gradXY02[dy][dx] += massY[dy][0] * wDx;
 
 1268                     gradXY20[dy][dx] += massY[dy][1] * wx;
 
 1273            for (
int dz = 0; dz < D1Dz; ++dz)
 
 1275               const real_t wz = Bct(dz,qz);
 
 1276               const real_t wDz = Gct(dz,qz);
 
 1277               for (
int dy = 0; dy < D1Dy; ++dy)
 
 1279                  for (
int dx = 0; dx < D1Dx; ++dx)
 
 1283                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
 
 1284                       e) += (-gradXY20[dy][dx] * wDz) + (gradXY02[dy][dx] * wz);
 
 1290         osc += D1Dx * D1Dy * D1Dz;
 
 1295         const int D1Dz = D1D - 1;
 
 1296         const int D1Dy = D1D;
 
 1297         const int D1Dx = D1D;
 
 1299         for (
int qx = 0; qx < Q1D; ++qx)
 
 1301            real_t gradYZ01[MD1D][MD1D];
 
 1302            real_t gradYZ10[MD1D][MD1D];
 
 1304            for (
int dy = 0; dy < D1Dy; ++dy)
 
 1306               for (
int dz = 0; dz < D1Dz; ++dz)
 
 1308                  gradYZ01[dz][dy] = 0.0;
 
 1309                  gradYZ10[dz][dy] = 0.0;
 
 1312            for (
int qy = 0; qy < Q1D; ++qy)
 
 1315               for (
int dz = 0; dz < D1Dz; ++dz)
 
 1317                  for (
int n = 0; n < 2; ++n)
 
 1322               for (
int qz = 0; qz < Q1D; ++qz)
 
 1324                  for (
int dz = 0; dz < D1Dz; ++dz)
 
 1326                     const real_t wz = Bot(dz,qz);
 
 1328                     massZ[dz][0] += wz * curl[qz][qy][qx][0];
 
 1329                     massZ[dz][1] += wz * curl[qz][qy][qx][1];
 
 1332               for (
int dy = 0; dy < D1Dy; ++dy)
 
 1334                  const real_t wy = Bct(dy,qy);
 
 1335                  const real_t wDy = Gct(dy,qy);
 
 1337                  for (
int dz = 0; dz < D1Dz; ++dz)
 
 1339                     gradYZ01[dz][dy] += wy * massZ[dz][1];
 
 1340                     gradYZ10[dz][dy] += wDy * massZ[dz][0];
 
 1345            for (
int dx = 0; dx < D1Dx; ++dx)
 
 1347               const real_t wx = Bct(dx,qx);
 
 1348               const real_t wDx = Gct(dx,qx);
 
 1350               for (
int dy = 0; dy < D1Dy; ++dy)
 
 1352                  for (
int dz = 0; dz < D1Dz; ++dz)
 
 1356                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
 
 1357                       e) += (gradYZ10[dz][dy] * wx) - (gradYZ01[dz][dy] * wDx);
 
 1367template<
int T_D1D = 0, 
int T_Q1D = 0>
 
 1368inline void SmemPACurlCurlApply3D(
const int d1d,
 
 1370                                  const bool symmetric,
 
 1372                                  const Array<real_t> &bo,
 
 1373                                  const Array<real_t> &bc,
 
 1374                                  const Array<real_t> &bot,
 
 1375                                  const Array<real_t> &bct,
 
 1376                                  const Array<real_t> &gc,
 
 1377                                  const Array<real_t> &gct,
 
 1378                                  const Vector &pa_data,
 
 1383               "Error: d1d > HCURL_MAX_D1D");
 
 1385               "Error: q1d > HCURL_MAX_Q1D");
 
 1386   const int D1D = T_D1D ? T_D1D : d1d;
 
 1387   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 1395   auto Bo = 
Reshape(bo.Read(), Q1D, D1D-1);
 
 1396   auto Bc = 
Reshape(bc.Read(), Q1D, D1D);
 
 1397   auto Gc = 
Reshape(gc.Read(), Q1D, D1D);
 
 1398   auto op = 
Reshape(pa_data.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
 
 1399   auto X = 
Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
 
 1400   auto Y = 
Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
 
 1402   const int s = symmetric ? 6 : 9;
 
 1404   auto device_kernel = [=] MFEM_DEVICE (
int e)
 
 1406      constexpr int VDIM = 3;
 
 1407      constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
 
 1408      constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
 
 1409      const int D1D = T_D1D ? T_D1D : d1d;
 
 1410      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 1412      MFEM_SHARED 
real_t sBo[MD1D][MQ1D];
 
 1413      MFEM_SHARED 
real_t sBc[MD1D][MQ1D];
 
 1414      MFEM_SHARED 
real_t sGc[MD1D][MQ1D];
 
 1417      MFEM_SHARED 
real_t sop[9][MQ1D][MQ1D];
 
 1418      MFEM_SHARED 
real_t curl[MQ1D][MQ1D][3];
 
 1420      MFEM_SHARED 
real_t sX[MD1D][MD1D][MD1D];
 
 1422      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 1424         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 1426            MFEM_FOREACH_THREAD(qz,z,Q1D)
 
 1428               for (
int i=0; i<s; ++i)
 
 1430                  ope[i] = op(qx,qy,qz,i,e);
 
 1436      const int tidx = MFEM_THREAD_ID(x);
 
 1437      const int tidy = MFEM_THREAD_ID(y);
 
 1438      const int tidz = MFEM_THREAD_ID(z);
 
 1442         MFEM_FOREACH_THREAD(d,y,D1D)
 
 1444            MFEM_FOREACH_THREAD(q,x,Q1D)
 
 1446               sBc[d][q] = Bc(q,d);
 
 1447               sGc[d][q] = Gc(q,d);
 
 1450                  sBo[d][q] = Bo(q,d);
 
 1457      for (
int qz=0; qz < Q1D; ++qz)
 
 1461            MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 1463               MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 1465                  for (
int i=0; i<3; ++i)
 
 1467                     curl[qy][qx][i] = 0.0;
 
 1474         for (
int c = 0; c < VDIM; ++c)  
 
 1476            const int D1Dz = (c == 2) ? D1D - 1 : D1D;
 
 1477            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
 
 1478            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
 
 1480            MFEM_FOREACH_THREAD(dz,z,D1Dz)
 
 1482               MFEM_FOREACH_THREAD(dy,y,D1Dy)
 
 1484                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
 
 1486                     sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
 
 1496                  for (
int i=0; i<s; ++i)
 
 1498                     sop[i][tidx][tidy] = ope[i];
 
 1502               MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 1504                  MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 1514                        for (
int dz = 0; dz < D1Dz; ++dz)
 
 1516                           const real_t wz = sBc[dz][qz];
 
 1517                           const real_t wDz = sGc[dz][qz];
 
 1519                           for (
int dy = 0; dy < D1Dy; ++dy)
 
 1521                              const real_t wy = sBc[dy][qy];
 
 1522                              const real_t wDy = sGc[dy][qy];
 
 1524                              for (
int dx = 0; dx < D1Dx; ++dx)
 
 1526                                 const real_t wx = sX[dz][dy][dx] * sBo[dx][qx];
 
 1533                        curl[qy][qx][1] += v; 
 
 1534                        curl[qy][qx][2] -= 
u;  
 
 1540                        for (
int dz = 0; dz < D1Dz; ++dz)
 
 1542                           const real_t wz = sBc[dz][qz];
 
 1543                           const real_t wDz = sGc[dz][qz];
 
 1545                           for (
int dy = 0; dy < D1Dy; ++dy)
 
 1547                              const real_t wy = sBo[dy][qy];
 
 1549                              for (
int dx = 0; dx < D1Dx; ++dx)
 
 1551                                 const real_t t = sX[dz][dy][dx];
 
 1552                                 const real_t wx = t * sBc[dx][qx];
 
 1553                                 const real_t wDx = t * sGc[dx][qx];
 
 1561                        curl[qy][qx][0] -= v; 
 
 1562                        curl[qy][qx][2] += 
u; 
 
 1568                        for (
int dz = 0; dz < D1Dz; ++dz)
 
 1570                           const real_t wz = sBo[dz][qz];
 
 1572                           for (
int dy = 0; dy < D1Dy; ++dy)
 
 1574                              const real_t wy = sBc[dy][qy];
 
 1575                              const real_t wDy = sGc[dy][qy];
 
 1577                              for (
int dx = 0; dx < D1Dx; ++dx)
 
 1579                                 const real_t t = sX[dz][dy][dx];
 
 1580                                 const real_t wx = t * sBc[dx][qx];
 
 1581                                 const real_t wDx = t * sGc[dx][qx];
 
 1589                        curl[qy][qx][0] += v; 
 
 1590                        curl[qy][qx][1] -= 
u; 
 
 1596            osc += D1Dx * D1Dy * D1Dz;
 
 1604         MFEM_FOREACH_THREAD(dz,z,D1D)
 
 1606            const real_t wcz = sBc[dz][qz];
 
 1607            const real_t wcDz = sGc[dz][qz];
 
 1608            const real_t wz = (dz < D1D-1) ? sBo[dz][qz] : 0.0;
 
 1610            MFEM_FOREACH_THREAD(dy,y,D1D)
 
 1612               MFEM_FOREACH_THREAD(dx,x,D1D)
 
 1614                  for (
int qy = 0; qy < Q1D; ++qy)
 
 1616                     const real_t wcy = sBc[dy][qy];
 
 1617                     const real_t wcDy = sGc[dy][qy];
 
 1618                     const real_t wy = (dy < D1D-1) ? sBo[dy][qy] : 0.0;
 
 1620                     for (
int qx = 0; qx < Q1D; ++qx)
 
 1622                        const real_t O11 = sop[0][qx][qy];
 
 1623                        const real_t O12 = sop[1][qx][qy];
 
 1624                        const real_t O13 = sop[2][qx][qy];
 
 1625                        const real_t O21 = symmetric ? O12 : sop[3][qx][qy];
 
 1626                        const real_t O22 = symmetric ? sop[3][qx][qy] : sop[4][qx][qy];
 
 1627                        const real_t O23 = symmetric ? sop[4][qx][qy] : sop[5][qx][qy];
 
 1628                        const real_t O31 = symmetric ? O13 : sop[6][qx][qy];
 
 1629                        const real_t O32 = symmetric ? O23 : sop[7][qx][qy];
 
 1630                        const real_t O33 = symmetric ? sop[5][qx][qy] : sop[8][qx][qy];
 
 1632                        const real_t c1 = (O11 * curl[qy][qx][0]) + (O12 * curl[qy][qx][1]) +
 
 1633                                          (O13 * curl[qy][qx][2]);
 
 1634                        const real_t c2 = (O21 * curl[qy][qx][0]) + (O22 * curl[qy][qx][1]) +
 
 1635                                          (O23 * curl[qy][qx][2]);
 
 1636                        const real_t c3 = (O31 * curl[qy][qx][0]) + (O32 * curl[qy][qx][1]) +
 
 1637                                          (O33 * curl[qy][qx][2]);
 
 1639                        const real_t wcx = sBc[dx][qx];
 
 1640                        const real_t wDx = sGc[dx][qx];
 
 1646                           const real_t wx = sBo[dx][qx];
 
 1647                           dxyz1 += (wx * c2 * wcy * wcDz) - (wx * c3 * wcDy * wcz);
 
 1652                        dxyz2 += (-wy * c1 * wcx * wcDz) + (wy * c3 * wDx * wcz);
 
 1656                        dxyz3 += (wcDy * wz * c1 * wcx) - (wcy * wz * c2 * wDx);
 
 1665         MFEM_FOREACH_THREAD(dz,z,D1D)
 
 1667            MFEM_FOREACH_THREAD(dy,y,D1D)
 
 1669               MFEM_FOREACH_THREAD(dx,x,D1D)
 
 1673                     Y(dx + ((dy + (dz * D1D)) * (D1D-1)), e) += dxyz1;
 
 1677                     Y(dx + ((dy + (dz * (D1D-1))) * D1D) + ((D1D-1)*D1D*D1D), e) += dxyz2;
 
 1681                     Y(dx + ((dy + (dz * D1D)) * D1D) + (2*(D1D-1)*D1D*D1D), e) += dxyz3;
 
 1689   auto host_kernel = [&] MFEM_LAMBDA (
int)
 
 1691      MFEM_ABORT_KERNEL(
"This kernel should only be used on GPU.");
 
 1694   ForallWrap<3>(
true, NE, device_kernel, host_kernel, Q1D, Q1D, Q1D);
 
 1698void PAHcurlL2Setup2D(
const int Q1D,
 
 1700                      const Array<real_t> &w,
 
 1705void PAHcurlL2Setup3D(
const int NQ,
 
 1708                      const Array<real_t> &w,
 
 1713void PAHcurlL2Apply2D(
const int D1D,
 
 1717                      const Array<real_t> &bo,
 
 1718                      const Array<real_t> &bot,
 
 1719                      const Array<real_t> &bt,
 
 1720                      const Array<real_t> &gc,
 
 1721                      const Vector &pa_data,
 
 1726void PAHcurlL2ApplyTranspose2D(
const int D1D,
 
 1730                               const Array<real_t> &bo,
 
 1731                               const Array<real_t> &bot,
 
 1732                               const Array<real_t> &
b,
 
 1733                               const Array<real_t> &gct,
 
 1734                               const Vector &pa_data,
 
 1739template<
int T_D1D = 0, 
int T_Q1D = 0>
 
 1740inline void PAHcurlL2Apply3D(
const int d1d,
 
 1744                             const Array<real_t> &bo,
 
 1745                             const Array<real_t> &bc,
 
 1746                             const Array<real_t> &bot,
 
 1747                             const Array<real_t> &bct,
 
 1748                             const Array<real_t> &gc,
 
 1749                             const Vector &pa_data,
 
 1754               "Error: d1d > HCURL_MAX_D1D");
 
 1756               "Error: q1d > HCURL_MAX_Q1D");
 
 1757   const int D1D = T_D1D ? T_D1D : d1d;
 
 1758   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 1760   auto Bo = 
Reshape(bo.Read(), Q1D, D1D-1);
 
 1761   auto Bc = 
Reshape(bc.Read(), Q1D, D1D);
 
 1762   auto Bot = 
Reshape(bot.Read(), D1D-1, Q1D);
 
 1763   auto Bct = 
Reshape(bct.Read(), D1D, Q1D);
 
 1764   auto Gc = 
Reshape(gc.Read(), Q1D, D1D);
 
 1765   auto op = 
Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
 
 1766   auto X = 
Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
 
 1767   auto Y = 
Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
 
 1780      constexpr int VDIM = 3;
 
 1781      constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
 
 1782      constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
 
 1783      const int D1D = T_D1D ? T_D1D : d1d;
 
 1784      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 1786      real_t curl[MQ1D][MQ1D][MQ1D][VDIM];
 
 1789      for (
int qz = 0; qz < Q1D; ++qz)
 
 1791         for (
int qy = 0; qy < Q1D; ++qy)
 
 1793            for (
int qx = 0; qx < Q1D; ++qx)
 
 1795               for (
int c = 0; c < VDIM; ++c)
 
 1797                  curl[qz][qy][qx][c] = 0.0;
 
 1809         const int D1Dz = D1D;
 
 1810         const int D1Dy = D1D;
 
 1811         const int D1Dx = D1D - 1;
 
 1813         for (
int dz = 0; dz < D1Dz; ++dz)
 
 1815            real_t gradXY[MQ1D][MQ1D][2];
 
 1816            for (
int qy = 0; qy < Q1D; ++qy)
 
 1818               for (
int qx = 0; qx < Q1D; ++qx)
 
 1820                  for (
int d = 0; d < 2; ++d)
 
 1822                     gradXY[qy][qx][d] = 0.0;
 
 1827            for (
int dy = 0; dy < D1Dy; ++dy)
 
 1830               for (
int qx = 0; qx < Q1D; ++qx)
 
 1835               for (
int dx = 0; dx < D1Dx; ++dx)
 
 1837                  const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
 
 1838                  for (
int qx = 0; qx < Q1D; ++qx)
 
 1840                     massX[qx] += t * Bo(qx,dx);
 
 1844               for (
int qy = 0; qy < Q1D; ++qy)
 
 1846                  const real_t wy = Bc(qy,dy);
 
 1847                  const real_t wDy = Gc(qy,dy);
 
 1848                  for (
int qx = 0; qx < Q1D; ++qx)
 
 1850                     const real_t wx = massX[qx];
 
 1851                     gradXY[qy][qx][0] += wx * wDy;
 
 1852                     gradXY[qy][qx][1] += wx * wy;
 
 1857            for (
int qz = 0; qz < Q1D; ++qz)
 
 1859               const real_t wz = Bc(qz,dz);
 
 1860               const real_t wDz = Gc(qz,dz);
 
 1861               for (
int qy = 0; qy < Q1D; ++qy)
 
 1863                  for (
int qx = 0; qx < Q1D; ++qx)
 
 1866                     curl[qz][qy][qx][1] += gradXY[qy][qx][1] * wDz; 
 
 1867                     curl[qz][qy][qx][2] -= gradXY[qy][qx][0] * wz;  
 
 1873         osc += D1Dx * D1Dy * D1Dz;
 
 1878         const int D1Dz = D1D;
 
 1879         const int D1Dy = D1D - 1;
 
 1880         const int D1Dx = D1D;
 
 1882         for (
int dz = 0; dz < D1Dz; ++dz)
 
 1884            real_t gradXY[MQ1D][MQ1D][2];
 
 1885            for (
int qy = 0; qy < Q1D; ++qy)
 
 1887               for (
int qx = 0; qx < Q1D; ++qx)
 
 1889                  for (
int d = 0; d < 2; ++d)
 
 1891                     gradXY[qy][qx][d] = 0.0;
 
 1896            for (
int dx = 0; dx < D1Dx; ++dx)
 
 1899               for (
int qy = 0; qy < Q1D; ++qy)
 
 1904               for (
int dy = 0; dy < D1Dy; ++dy)
 
 1906                  const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
 
 1907                  for (
int qy = 0; qy < Q1D; ++qy)
 
 1909                     massY[qy] += t * Bo(qy,dy);
 
 1913               for (
int qx = 0; qx < Q1D; ++qx)
 
 1915                  const real_t wx = Bc(qx,dx);
 
 1916                  const real_t wDx = Gc(qx,dx);
 
 1917                  for (
int qy = 0; qy < Q1D; ++qy)
 
 1919                     const real_t wy = massY[qy];
 
 1920                     gradXY[qy][qx][0] += wDx * wy;
 
 1921                     gradXY[qy][qx][1] += wx * wy;
 
 1926            for (
int qz = 0; qz < Q1D; ++qz)
 
 1928               const real_t wz = Bc(qz,dz);
 
 1929               const real_t wDz = Gc(qz,dz);
 
 1930               for (
int qy = 0; qy < Q1D; ++qy)
 
 1932                  for (
int qx = 0; qx < Q1D; ++qx)
 
 1935                     curl[qz][qy][qx][0] -= gradXY[qy][qx][1] * wDz; 
 
 1936                     curl[qz][qy][qx][2] += gradXY[qy][qx][0] * wz;  
 
 1942         osc += D1Dx * D1Dy * D1Dz;
 
 1947         const int D1Dz = D1D - 1;
 
 1948         const int D1Dy = D1D;
 
 1949         const int D1Dx = D1D;
 
 1951         for (
int dx = 0; dx < D1Dx; ++dx)
 
 1953            real_t gradYZ[MQ1D][MQ1D][2];
 
 1954            for (
int qz = 0; qz < Q1D; ++qz)
 
 1956               for (
int qy = 0; qy < Q1D; ++qy)
 
 1958                  for (
int d = 0; d < 2; ++d)
 
 1960                     gradYZ[qz][qy][d] = 0.0;
 
 1965            for (
int dy = 0; dy < D1Dy; ++dy)
 
 1968               for (
int qz = 0; qz < Q1D; ++qz)
 
 1973               for (
int dz = 0; dz < D1Dz; ++dz)
 
 1975                  const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
 
 1976                  for (
int qz = 0; qz < Q1D; ++qz)
 
 1978                     massZ[qz] += t * Bo(qz,dz);
 
 1982               for (
int qy = 0; qy < Q1D; ++qy)
 
 1984                  const real_t wy = Bc(qy,dy);
 
 1985                  const real_t wDy = Gc(qy,dy);
 
 1986                  for (
int qz = 0; qz < Q1D; ++qz)
 
 1988                     const real_t wz = massZ[qz];
 
 1989                     gradYZ[qz][qy][0] += wz * wy;
 
 1990                     gradYZ[qz][qy][1] += wz * wDy;
 
 1995            for (
int qx = 0; qx < Q1D; ++qx)
 
 1997               const real_t wx = Bc(qx,dx);
 
 1998               const real_t wDx = Gc(qx,dx);
 
 2000               for (
int qy = 0; qy < Q1D; ++qy)
 
 2002                  for (
int qz = 0; qz < Q1D; ++qz)
 
 2005                     curl[qz][qy][qx][0] += gradYZ[qz][qy][1] * wx;  
 
 2006                     curl[qz][qy][qx][1] -= gradYZ[qz][qy][0] * wDx; 
 
 2014      for (
int qz = 0; qz < Q1D; ++qz)
 
 2016         for (
int qy = 0; qy < Q1D; ++qy)
 
 2018            for (
int qx = 0; qx < Q1D; ++qx)
 
 2020               const real_t O11 = op(0,qx,qy,qz,e);
 
 2023                  for (
int c = 0; c < VDIM; ++c)
 
 2025                     curl[qz][qy][qx][c] *= O11;
 
 2030                  const real_t O21 = op(1,qx,qy,qz,e);
 
 2031                  const real_t O31 = op(2,qx,qy,qz,e);
 
 2032                  const real_t O12 = op(3,qx,qy,qz,e);
 
 2033                  const real_t O22 = op(4,qx,qy,qz,e);
 
 2034                  const real_t O32 = op(5,qx,qy,qz,e);
 
 2035                  const real_t O13 = op(6,qx,qy,qz,e);
 
 2036                  const real_t O23 = op(7,qx,qy,qz,e);
 
 2037                  const real_t O33 = op(8,qx,qy,qz,e);
 
 2038                  const real_t curlX = curl[qz][qy][qx][0];
 
 2039                  const real_t curlY = curl[qz][qy][qx][1];
 
 2040                  const real_t curlZ = curl[qz][qy][qx][2];
 
 2041                  curl[qz][qy][qx][0] = (O11*curlX)+(O12*curlY)+(O13*curlZ);
 
 2042                  curl[qz][qy][qx][1] = (O21*curlX)+(O22*curlY)+(O23*curlZ);
 
 2043                  curl[qz][qy][qx][2] = (O31*curlX)+(O32*curlY)+(O33*curlZ);
 
 2049      for (
int qz = 0; qz < Q1D; ++qz)
 
 2051         real_t massXY[MD1D][MD1D];
 
 2055         for (
int c = 0; c < VDIM; ++c)  
 
 2057            const int D1Dz = (c == 2) ? D1D - 1 : D1D;
 
 2058            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
 
 2059            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
 
 2061            for (
int dy = 0; dy < D1Dy; ++dy)
 
 2063               for (
int dx = 0; dx < D1Dx; ++dx)
 
 2068            for (
int qy = 0; qy < Q1D; ++qy)
 
 2071               for (
int dx = 0; dx < D1Dx; ++dx)
 
 2075               for (
int qx = 0; qx < Q1D; ++qx)
 
 2077                  for (
int dx = 0; dx < D1Dx; ++dx)
 
 2079                     massX[dx] += curl[qz][qy][qx][c] * ((c == 0) ? Bot(dx,qx) : Bct(dx,qx));
 
 2083               for (
int dy = 0; dy < D1Dy; ++dy)
 
 2085                  const real_t wy = (c == 1) ? Bot(dy,qy) : Bct(dy,qy);
 
 2086                  for (
int dx = 0; dx < D1Dx; ++dx)
 
 2088                     massXY[dy][dx] += massX[dx] * wy;
 
 2093            for (
int dz = 0; dz < D1Dz; ++dz)
 
 2095               const real_t wz = (c == 2) ? Bot(dz,qz) : Bct(dz,qz);
 
 2096               for (
int dy = 0; dy < D1Dy; ++dy)
 
 2098                  for (
int dx = 0; dx < D1Dx; ++dx)
 
 2100                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += massXY[dy][dx] * wz;
 
 2105            osc += D1Dx * D1Dy * D1Dz;
 
 2112template<
int T_D1D = 0, 
int T_Q1D = 0>
 
 2113inline void SmemPAHcurlL2Apply3D(
const int d1d,
 
 2117                                 const Array<real_t> &bo,
 
 2118                                 const Array<real_t> &bc,
 
 2119                                 const Array<real_t> &gc,
 
 2120                                 const Vector &pa_data,
 
 2125               "Error: d1d > HCURL_MAX_D1D");
 
 2127               "Error: q1d > HCURL_MAX_Q1D");
 
 2128   const int D1D = T_D1D ? T_D1D : d1d;
 
 2129   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 2131   auto Bo = 
Reshape(bo.Read(), Q1D, D1D-1);
 
 2132   auto Bc = 
Reshape(bc.Read(), Q1D, D1D);
 
 2133   auto Gc = 
Reshape(gc.Read(), Q1D, D1D);
 
 2134   auto op = 
Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
 
 2135   auto X = 
Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
 
 2136   auto Y = 
Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
 
 2138   auto device_kernel = [=] MFEM_DEVICE (
int e)
 
 2140      constexpr int VDIM = 3;
 
 2141      constexpr int maxCoeffDim = 9;
 
 2142      constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
 
 2143      constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
 
 2144      const int D1D = T_D1D ? T_D1D : d1d;
 
 2145      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 2147      MFEM_SHARED 
real_t sBo[MD1D][MQ1D];
 
 2148      MFEM_SHARED 
real_t sBc[MD1D][MQ1D];
 
 2149      MFEM_SHARED 
real_t sGc[MD1D][MQ1D];
 
 2152      MFEM_SHARED 
real_t sop[maxCoeffDim][MQ1D][MQ1D];
 
 2153      MFEM_SHARED 
real_t curl[MQ1D][MQ1D][3];
 
 2155      MFEM_SHARED 
real_t sX[MD1D][MD1D][MD1D];
 
 2157      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 2159         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 2161            MFEM_FOREACH_THREAD(qz,z,Q1D)
 
 2163               for (
int i=0; i<coeffDim; ++i)
 
 2165                  opc[i] = op(i,qx,qy,qz,e);
 
 2171      const int tidx = MFEM_THREAD_ID(x);
 
 2172      const int tidy = MFEM_THREAD_ID(y);
 
 2173      const int tidz = MFEM_THREAD_ID(z);
 
 2177         MFEM_FOREACH_THREAD(d,y,D1D)
 
 2179            MFEM_FOREACH_THREAD(q,x,Q1D)
 
 2181               sBc[d][q] = Bc(q,d);
 
 2182               sGc[d][q] = Gc(q,d);
 
 2185                  sBo[d][q] = Bo(q,d);
 
 2192      for (
int qz=0; qz < Q1D; ++qz)
 
 2196            MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 2198               MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 2200                  for (
int i=0; i<3; ++i)
 
 2202                     curl[qy][qx][i] = 0.0;
 
 2209         for (
int c = 0; c < VDIM; ++c)  
 
 2211            const int D1Dz = (c == 2) ? D1D - 1 : D1D;
 
 2212            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
 
 2213            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
 
 2215            MFEM_FOREACH_THREAD(dz,z,D1Dz)
 
 2217               MFEM_FOREACH_THREAD(dy,y,D1Dy)
 
 2219                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
 
 2221                     sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
 
 2231                  for (
int i=0; i<coeffDim; ++i)
 
 2233                     sop[i][tidx][tidy] = opc[i];
 
 2237               MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 2239                  MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 2249                        for (
int dz = 0; dz < D1Dz; ++dz)
 
 2251                           const real_t wz = sBc[dz][qz];
 
 2252                           const real_t wDz = sGc[dz][qz];
 
 2254                           for (
int dy = 0; dy < D1Dy; ++dy)
 
 2256                              const real_t wy = sBc[dy][qy];
 
 2257                              const real_t wDy = sGc[dy][qy];
 
 2259                              for (
int dx = 0; dx < D1Dx; ++dx)
 
 2261                                 const real_t wx = sX[dz][dy][dx] * sBo[dx][qx];
 
 2268                        curl[qy][qx][1] += v; 
 
 2269                        curl[qy][qx][2] -= 
u;  
 
 2275                        for (
int dz = 0; dz < D1Dz; ++dz)
 
 2277                           const real_t wz = sBc[dz][qz];
 
 2278                           const real_t wDz = sGc[dz][qz];
 
 2280                           for (
int dy = 0; dy < D1Dy; ++dy)
 
 2282                              const real_t wy = sBo[dy][qy];
 
 2284                              for (
int dx = 0; dx < D1Dx; ++dx)
 
 2286                                 const real_t t = sX[dz][dy][dx];
 
 2287                                 const real_t wx = t * sBc[dx][qx];
 
 2288                                 const real_t wDx = t * sGc[dx][qx];
 
 2296                        curl[qy][qx][0] -= v; 
 
 2297                        curl[qy][qx][2] += 
u; 
 
 2303                        for (
int dz = 0; dz < D1Dz; ++dz)
 
 2305                           const real_t wz = sBo[dz][qz];
 
 2307                           for (
int dy = 0; dy < D1Dy; ++dy)
 
 2309                              const real_t wy = sBc[dy][qy];
 
 2310                              const real_t wDy = sGc[dy][qy];
 
 2312                              for (
int dx = 0; dx < D1Dx; ++dx)
 
 2314                                 const real_t t = sX[dz][dy][dx];
 
 2315                                 const real_t wx = t * sBc[dx][qx];
 
 2316                                 const real_t wDx = t * sGc[dx][qx];
 
 2324                        curl[qy][qx][0] += v; 
 
 2325                        curl[qy][qx][1] -= 
u; 
 
 2331            osc += D1Dx * D1Dy * D1Dz;
 
 2339         MFEM_FOREACH_THREAD(dz,z,D1D)
 
 2341            const real_t wcz = sBc[dz][qz];
 
 2342            const real_t wz = (dz < D1D-1) ? sBo[dz][qz] : 0.0;
 
 2344            MFEM_FOREACH_THREAD(dy,y,D1D)
 
 2346               MFEM_FOREACH_THREAD(dx,x,D1D)
 
 2348                  for (
int qy = 0; qy < Q1D; ++qy)
 
 2350                     const real_t wcy = sBc[dy][qy];
 
 2351                     const real_t wy = (dy < D1D-1) ? sBo[dy][qy] : 0.0;
 
 2353                     for (
int qx = 0; qx < Q1D; ++qx)
 
 2355                        const real_t O11 = sop[0][qx][qy];
 
 2359                           c1 = O11 * curl[qy][qx][0];
 
 2360                           c2 = O11 * curl[qy][qx][1];
 
 2361                           c3 = O11 * curl[qy][qx][2];
 
 2365                           const real_t O21 = sop[1][qx][qy];
 
 2366                           const real_t O31 = sop[2][qx][qy];
 
 2367                           const real_t O12 = sop[3][qx][qy];
 
 2368                           const real_t O22 = sop[4][qx][qy];
 
 2369                           const real_t O32 = sop[5][qx][qy];
 
 2370                           const real_t O13 = sop[6][qx][qy];
 
 2371                           const real_t O23 = sop[7][qx][qy];
 
 2372                           const real_t O33 = sop[8][qx][qy];
 
 2373                           c1 = (O11*curl[qy][qx][0])+(O12*curl[qy][qx][1])+(O13*curl[qy][qx][2]);
 
 2374                           c2 = (O21*curl[qy][qx][0])+(O22*curl[qy][qx][1])+(O23*curl[qy][qx][2]);
 
 2375                           c3 = (O31*curl[qy][qx][0])+(O32*curl[qy][qx][1])+(O33*curl[qy][qx][2]);
 
 2378                        const real_t wcx = sBc[dx][qx];
 
 2382                           const real_t wx = sBo[dx][qx];
 
 2383                           dxyz1 += c1 * wx * wcy * wcz;
 
 2386                        dxyz2 += c2 * wcx * wy * wcz;
 
 2387                        dxyz3 += c3 * wcx * wcy * wz;
 
 2396         MFEM_FOREACH_THREAD(dz,z,D1D)
 
 2398            MFEM_FOREACH_THREAD(dy,y,D1D)
 
 2400               MFEM_FOREACH_THREAD(dx,x,D1D)
 
 2404                     Y(dx + ((dy + (dz * D1D)) * (D1D-1)), e) += dxyz1;
 
 2408                     Y(dx + ((dy + (dz * (D1D-1))) * D1D) + ((D1D-1)*D1D*D1D), e) += dxyz2;
 
 2412                     Y(dx + ((dy + (dz * D1D)) * D1D) + (2*(D1D-1)*D1D*D1D), e) += dxyz3;
 
 2420   auto host_kernel = [&] MFEM_LAMBDA (
int)
 
 2422      MFEM_ABORT_KERNEL(
"This kernel should only be used on GPU.");
 
 2425   ForallWrap<3>(
true, NE, device_kernel, host_kernel, Q1D, Q1D, Q1D);
 
 2429template<
int T_D1D = 0, 
int T_Q1D = 0>
 
 2430inline void PAHcurlL2ApplyTranspose3D(
const int d1d,
 
 2434                                      const Array<real_t> &bo,
 
 2435                                      const Array<real_t> &bc,
 
 2436                                      const Array<real_t> &bot,
 
 2437                                      const Array<real_t> &bct,
 
 2438                                      const Array<real_t> &gct,
 
 2439                                      const Vector &pa_data,
 
 2445               "Error: d1d > HCURL_MAX_D1D");
 
 2447               "Error: q1d > HCURL_MAX_Q1D");
 
 2448   const int D1D = T_D1D ? T_D1D : d1d;
 
 2449   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 2451   auto Bo = 
Reshape(bo.Read(), Q1D, D1D-1);
 
 2452   auto Bc = 
Reshape(bc.Read(), Q1D, D1D);
 
 2453   auto Bot = 
Reshape(bot.Read(), D1D-1, Q1D);
 
 2454   auto Bct = 
Reshape(bct.Read(), D1D, Q1D);
 
 2455   auto Gct = 
Reshape(gct.Read(), D1D, Q1D);
 
 2456   auto op = 
Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
 
 2457   auto X = 
Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
 
 2458   auto Y = 
Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
 
 2462      constexpr int VDIM = 3;
 
 2463      constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
 
 2464      constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
 
 2465      const int D1D = T_D1D ? T_D1D : d1d;
 
 2466      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 2468      real_t mass[MQ1D][MQ1D][MQ1D][VDIM];
 
 2470      for (
int qz = 0; qz < Q1D; ++qz)
 
 2472         for (
int qy = 0; qy < Q1D; ++qy)
 
 2474            for (
int qx = 0; qx < Q1D; ++qx)
 
 2476               for (
int c = 0; c < VDIM; ++c)
 
 2478                  mass[qz][qy][qx][c] = 0.0;
 
 2486      for (
int c = 0; c < VDIM; ++c)  
 
 2488         const int D1Dz = (c == 2) ? D1D - 1 : D1D;
 
 2489         const int D1Dy = (c == 1) ? D1D - 1 : D1D;
 
 2490         const int D1Dx = (c == 0) ? D1D - 1 : D1D;
 
 2492         for (
int dz = 0; dz < D1Dz; ++dz)
 
 2494            real_t massXY[MQ1D][MQ1D];
 
 2495            for (
int qy = 0; qy < Q1D; ++qy)
 
 2497               for (
int qx = 0; qx < Q1D; ++qx)
 
 2499                  massXY[qy][qx] = 0.0;
 
 2503            for (
int dy = 0; dy < D1Dy; ++dy)
 
 2506               for (
int qx = 0; qx < Q1D; ++qx)
 
 2511               for (
int dx = 0; dx < D1Dx; ++dx)
 
 2513                  const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
 
 2514                  for (
int qx = 0; qx < Q1D; ++qx)
 
 2516                     massX[qx] += t * ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
 
 2520               for (
int qy = 0; qy < Q1D; ++qy)
 
 2522                  const real_t wy = (c == 1) ? Bo(qy,dy) : Bc(qy,dy);
 
 2523                  for (
int qx = 0; qx < Q1D; ++qx)
 
 2525                     const real_t wx = massX[qx];
 
 2526                     massXY[qy][qx] += wx * wy;
 
 2531            for (
int qz = 0; qz < Q1D; ++qz)
 
 2533               const real_t wz = (c == 2) ? Bo(qz,dz) : Bc(qz,dz);
 
 2534               for (
int qy = 0; qy < Q1D; ++qy)
 
 2536                  for (
int qx = 0; qx < Q1D; ++qx)
 
 2538                     mass[qz][qy][qx][c] += massXY[qy][qx] * wz;
 
 2544         osc += D1Dx * D1Dy * D1Dz;
 
 2548      for (
int qz = 0; qz < Q1D; ++qz)
 
 2550         for (
int qy = 0; qy < Q1D; ++qy)
 
 2552            for (
int qx = 0; qx < Q1D; ++qx)
 
 2554               const real_t O11 = op(0,qx,qy,qz,e);
 
 2557                  for (
int c = 0; c < VDIM; ++c)
 
 2559                     mass[qz][qy][qx][c] *= O11;
 
 2564                  const real_t O12 = op(1,qx,qy,qz,e);
 
 2565                  const real_t O13 = op(2,qx,qy,qz,e);
 
 2566                  const real_t O21 = op(3,qx,qy,qz,e);
 
 2567                  const real_t O22 = op(4,qx,qy,qz,e);
 
 2568                  const real_t O23 = op(5,qx,qy,qz,e);
 
 2569                  const real_t O31 = op(6,qx,qy,qz,e);
 
 2570                  const real_t O32 = op(7,qx,qy,qz,e);
 
 2571                  const real_t O33 = op(8,qx,qy,qz,e);
 
 2572                  const real_t massX = mass[qz][qy][qx][0];
 
 2573                  const real_t massY = mass[qz][qy][qx][1];
 
 2574                  const real_t massZ = mass[qz][qy][qx][2];
 
 2575                  mass[qz][qy][qx][0] = (O11*massX)+(O12*massY)+(O13*massZ);
 
 2576                  mass[qz][qy][qx][1] = (O21*massX)+(O22*massY)+(O23*massZ);
 
 2577                  mass[qz][qy][qx][2] = (O31*massX)+(O32*massY)+(O33*massZ);
 
 2586         const int D1Dz = D1D;
 
 2587         const int D1Dy = D1D;
 
 2588         const int D1Dx = D1D - 1;
 
 2590         for (
int qz = 0; qz < Q1D; ++qz)
 
 2592            real_t gradXY12[MD1D][MD1D];
 
 2593            real_t gradXY21[MD1D][MD1D];
 
 2595            for (
int dy = 0; dy < D1Dy; ++dy)
 
 2597               for (
int dx = 0; dx < D1Dx; ++dx)
 
 2599                  gradXY12[dy][dx] = 0.0;
 
 2600                  gradXY21[dy][dx] = 0.0;
 
 2603            for (
int qy = 0; qy < Q1D; ++qy)
 
 2606               for (
int dx = 0; dx < D1Dx; ++dx)
 
 2608                  for (
int n = 0; n < 2; ++n)
 
 2613               for (
int qx = 0; qx < Q1D; ++qx)
 
 2615                  for (
int dx = 0; dx < D1Dx; ++dx)
 
 2617                     const real_t wx = Bot(dx,qx);
 
 2619                     massX[dx][0] += wx * mass[qz][qy][qx][1];
 
 2620                     massX[dx][1] += wx * mass[qz][qy][qx][2];
 
 2623               for (
int dy = 0; dy < D1Dy; ++dy)
 
 2625                  const real_t wy = Bct(dy,qy);
 
 2626                  const real_t wDy = Gct(dy,qy);
 
 2628                  for (
int dx = 0; dx < D1Dx; ++dx)
 
 2630                     gradXY21[dy][dx] += massX[dx][0] * wy;
 
 2631                     gradXY12[dy][dx] += massX[dx][1] * wDy;
 
 2636            for (
int dz = 0; dz < D1Dz; ++dz)
 
 2638               const real_t wz = Bct(dz,qz);
 
 2639               const real_t wDz = Gct(dz,qz);
 
 2640               for (
int dy = 0; dy < D1Dy; ++dy)
 
 2642                  for (
int dx = 0; dx < D1Dx; ++dx)
 
 2646                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
 
 2647                       e) += (gradXY21[dy][dx] * wDz) - (gradXY12[dy][dx] * wz);
 
 2653         osc += D1Dx * D1Dy * D1Dz;
 
 2658         const int D1Dz = D1D;
 
 2659         const int D1Dy = D1D - 1;
 
 2660         const int D1Dx = D1D;
 
 2662         for (
int qz = 0; qz < Q1D; ++qz)
 
 2664            real_t gradXY02[MD1D][MD1D];
 
 2665            real_t gradXY20[MD1D][MD1D];
 
 2667            for (
int dy = 0; dy < D1Dy; ++dy)
 
 2669               for (
int dx = 0; dx < D1Dx; ++dx)
 
 2671                  gradXY02[dy][dx] = 0.0;
 
 2672                  gradXY20[dy][dx] = 0.0;
 
 2675            for (
int qx = 0; qx < Q1D; ++qx)
 
 2678               for (
int dy = 0; dy < D1Dy; ++dy)
 
 2683               for (
int qy = 0; qy < Q1D; ++qy)
 
 2685                  for (
int dy = 0; dy < D1Dy; ++dy)
 
 2687                     const real_t wy = Bot(dy,qy);
 
 2689                     massY[dy][0] += wy * mass[qz][qy][qx][2];
 
 2690                     massY[dy][1] += wy * mass[qz][qy][qx][0];
 
 2693               for (
int dx = 0; dx < D1Dx; ++dx)
 
 2695                  const real_t wx = Bct(dx,qx);
 
 2696                  const real_t wDx = Gct(dx,qx);
 
 2698                  for (
int dy = 0; dy < D1Dy; ++dy)
 
 2700                     gradXY02[dy][dx] += massY[dy][0] * wDx;
 
 2701                     gradXY20[dy][dx] += massY[dy][1] * wx;
 
 2706            for (
int dz = 0; dz < D1Dz; ++dz)
 
 2708               const real_t wz = Bct(dz,qz);
 
 2709               const real_t wDz = Gct(dz,qz);
 
 2710               for (
int dy = 0; dy < D1Dy; ++dy)
 
 2712                  for (
int dx = 0; dx < D1Dx; ++dx)
 
 2716                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
 
 2717                       e) += (-gradXY20[dy][dx] * wDz) + (gradXY02[dy][dx] * wz);
 
 2723         osc += D1Dx * D1Dy * D1Dz;
 
 2728         const int D1Dz = D1D - 1;
 
 2729         const int D1Dy = D1D;
 
 2730         const int D1Dx = D1D;
 
 2732         for (
int qx = 0; qx < Q1D; ++qx)
 
 2734            real_t gradYZ01[MD1D][MD1D];
 
 2735            real_t gradYZ10[MD1D][MD1D];
 
 2737            for (
int dy = 0; dy < D1Dy; ++dy)
 
 2739               for (
int dz = 0; dz < D1Dz; ++dz)
 
 2741                  gradYZ01[dz][dy] = 0.0;
 
 2742                  gradYZ10[dz][dy] = 0.0;
 
 2745            for (
int qy = 0; qy < Q1D; ++qy)
 
 2748               for (
int dz = 0; dz < D1Dz; ++dz)
 
 2750                  for (
int n = 0; n < 2; ++n)
 
 2755               for (
int qz = 0; qz < Q1D; ++qz)
 
 2757                  for (
int dz = 0; dz < D1Dz; ++dz)
 
 2759                     const real_t wz = Bot(dz,qz);
 
 2761                     massZ[dz][0] += wz * mass[qz][qy][qx][0];
 
 2762                     massZ[dz][1] += wz * mass[qz][qy][qx][1];
 
 2765               for (
int dy = 0; dy < D1Dy; ++dy)
 
 2767                  const real_t wy = Bct(dy,qy);
 
 2768                  const real_t wDy = Gct(dy,qy);
 
 2770                  for (
int dz = 0; dz < D1Dz; ++dz)
 
 2772                     gradYZ01[dz][dy] += wy * massZ[dz][1];
 
 2773                     gradYZ10[dz][dy] += wDy * massZ[dz][0];
 
 2778            for (
int dx = 0; dx < D1Dx; ++dx)
 
 2780               const real_t wx = Bct(dx,qx);
 
 2781               const real_t wDx = Gct(dx,qx);
 
 2783               for (
int dy = 0; dy < D1Dy; ++dy)
 
 2785                  for (
int dz = 0; dz < D1Dz; ++dz)
 
 2789                     Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
 
 2790                       e) += (gradYZ10[dz][dy] * wx) - (gradYZ01[dz][dy] * wDx);
 
 2800template<
int T_D1D = 0, 
int T_Q1D = 0>
 
 2801inline void SmemPAHcurlL2ApplyTranspose3D(
const int d1d,
 
 2805                                          const Array<real_t> &bo,
 
 2806                                          const Array<real_t> &bc,
 
 2807                                          const Array<real_t> &gc,
 
 2808                                          const Vector &pa_data,
 
 2813               "Error: d1d > HCURL_MAX_D1D");
 
 2815               "Error: q1d > HCURL_MAX_Q1D");
 
 2816   const int D1D = T_D1D ? T_D1D : d1d;
 
 2817   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 2819   auto Bo = 
Reshape(bo.Read(), Q1D, D1D-1);
 
 2820   auto Bc = 
Reshape(bc.Read(), Q1D, D1D);
 
 2821   auto Gc = 
Reshape(gc.Read(), Q1D, D1D);
 
 2822   auto op = 
Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
 
 2823   auto X = 
Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
 
 2824   auto Y = 
Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
 
 2826   auto device_kernel = [=] MFEM_DEVICE (
int e)
 
 2828      constexpr int VDIM = 3;
 
 2829      constexpr int maxCoeffDim = 9;
 
 2830      constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
 
 2831      constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
 
 2832      const int D1D = T_D1D ? T_D1D : d1d;
 
 2833      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 2835      MFEM_SHARED 
real_t sBo[MD1D][MQ1D];
 
 2836      MFEM_SHARED 
real_t sBc[MD1D][MQ1D];
 
 2837      MFEM_SHARED 
real_t sGc[MD1D][MQ1D];
 
 2840      MFEM_SHARED 
real_t sop[maxCoeffDim][MQ1D][MQ1D];
 
 2841      MFEM_SHARED 
real_t mass[MQ1D][MQ1D][3];
 
 2843      MFEM_SHARED 
real_t sX[MD1D][MD1D][MD1D];
 
 2845      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 2847         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 2849            MFEM_FOREACH_THREAD(qz,z,Q1D)
 
 2851               for (
int i=0; i<coeffDim; ++i)
 
 2853                  opc[i] = op(i,qx,qy,qz,e);
 
 2859      const int tidx = MFEM_THREAD_ID(x);
 
 2860      const int tidy = MFEM_THREAD_ID(y);
 
 2861      const int tidz = MFEM_THREAD_ID(z);
 
 2865         MFEM_FOREACH_THREAD(d,y,D1D)
 
 2867            MFEM_FOREACH_THREAD(q,x,Q1D)
 
 2869               sBc[d][q] = Bc(q,d);
 
 2870               sGc[d][q] = Gc(q,d);
 
 2873                  sBo[d][q] = Bo(q,d);
 
 2880      for (
int qz=0; qz < Q1D; ++qz)
 
 2884            MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 2886               MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 2888                  for (
int i=0; i<3; ++i)
 
 2890                     mass[qy][qx][i] = 0.0;
 
 2897         for (
int c = 0; c < VDIM; ++c)  
 
 2899            const int D1Dz = (c == 2) ? D1D - 1 : D1D;
 
 2900            const int D1Dy = (c == 1) ? D1D - 1 : D1D;
 
 2901            const int D1Dx = (c == 0) ? D1D - 1 : D1D;
 
 2903            MFEM_FOREACH_THREAD(dz,z,D1Dz)
 
 2905               MFEM_FOREACH_THREAD(dy,y,D1Dy)
 
 2907                  MFEM_FOREACH_THREAD(dx,x,D1Dx)
 
 2909                     sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
 
 2919                  for (
int i=0; i<coeffDim; ++i)
 
 2921                     sop[i][tidx][tidy] = opc[i];
 
 2925               MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 2927                  MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 2931                     for (
int dz = 0; dz < D1Dz; ++dz)
 
 2933                        const real_t wz = (c == 2) ? sBo[dz][qz] : sBc[dz][qz];
 
 2935                        for (
int dy = 0; dy < D1Dy; ++dy)
 
 2937                           const real_t wy = (c == 1) ? sBo[dy][qy] : sBc[dy][qy];
 
 2939                           for (
int dx = 0; dx < D1Dx; ++dx)
 
 2941                              const real_t wx = sX[dz][dy][dx] * ((c == 0) ? sBo[dx][qx] : sBc[dx][qx]);
 
 2947                     mass[qy][qx][c] += 
u;
 
 2952            osc += D1Dx * D1Dy * D1Dz;
 
 2960         MFEM_FOREACH_THREAD(dz,z,D1D)
 
 2962            const real_t wcz = sBc[dz][qz];
 
 2963            const real_t wcDz = sGc[dz][qz];
 
 2964            const real_t wz = (dz < D1D-1) ? sBo[dz][qz] : 0.0;
 
 2966            MFEM_FOREACH_THREAD(dy,y,D1D)
 
 2968               MFEM_FOREACH_THREAD(dx,x,D1D)
 
 2970                  for (
int qy = 0; qy < Q1D; ++qy)
 
 2972                     const real_t wcy = sBc[dy][qy];
 
 2973                     const real_t wcDy = sGc[dy][qy];
 
 2974                     const real_t wy = (dy < D1D-1) ? sBo[dy][qy] : 0.0;
 
 2976                     for (
int qx = 0; qx < Q1D; ++qx)
 
 2978                        const real_t O11 = sop[0][qx][qy];
 
 2982                           c1 = O11 * mass[qy][qx][0];
 
 2983                           c2 = O11 * mass[qy][qx][1];
 
 2984                           c3 = O11 * mass[qy][qx][2];
 
 2988                           const real_t O12 = sop[1][qx][qy];
 
 2989                           const real_t O13 = sop[2][qx][qy];
 
 2990                           const real_t O21 = sop[3][qx][qy];
 
 2991                           const real_t O22 = sop[4][qx][qy];
 
 2992                           const real_t O23 = sop[5][qx][qy];
 
 2993                           const real_t O31 = sop[6][qx][qy];
 
 2994                           const real_t O32 = sop[7][qx][qy];
 
 2995                           const real_t O33 = sop[8][qx][qy];
 
 2997                           c1 = (O11*mass[qy][qx][0])+(O12*mass[qy][qx][1])+(O13*mass[qy][qx][2]);
 
 2998                           c2 = (O21*mass[qy][qx][0])+(O22*mass[qy][qx][1])+(O23*mass[qy][qx][2]);
 
 2999                           c3 = (O31*mass[qy][qx][0])+(O32*mass[qy][qx][1])+(O33*mass[qy][qx][2]);
 
 3002                        const real_t wcx = sBc[dx][qx];
 
 3003                        const real_t wDx = sGc[dx][qx];
 
 3007                           const real_t wx = sBo[dx][qx];
 
 3008                           dxyz1 += (wx * c2 * wcy * wcDz) - (wx * c3 * wcDy * wcz);
 
 3011                        dxyz2 += (-wy * c1 * wcx * wcDz) + (wy * c3 * wDx * wcz);
 
 3013                        dxyz3 += (wcDy * wz * c1 * wcx) - (wcy * wz * c2 * wDx);
 
 3022         MFEM_FOREACH_THREAD(dz,z,D1D)
 
 3024            MFEM_FOREACH_THREAD(dy,y,D1D)
 
 3026               MFEM_FOREACH_THREAD(dx,x,D1D)
 
 3030                     Y(dx + ((dy + (dz * D1D)) * (D1D-1)), e) += dxyz1;
 
 3034                     Y(dx + ((dy + (dz * (D1D-1))) * D1D) + ((D1D-1)*D1D*D1D), e) += dxyz2;
 
 3038                     Y(dx + ((dy + (dz * D1D)) * D1D) + (2*(D1D-1)*D1D*D1D), e) += dxyz3;
 
 3046   auto host_kernel = [&] MFEM_LAMBDA (
int)
 
 3048      MFEM_ABORT_KERNEL(
"This kernel should only be used on GPU.");
 
 3051   ForallWrap<3>(
true, NE, device_kernel, host_kernel, Q1D, Q1D, Q1D);
 
real_t u(const Vector &xvec)
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
void ForallWrap(const bool use_dev, const int N, d_lambda &&d_body, h_lambda &&h_body, const int X=0, const int Y=0, const int Z=0, const int G=0)
The forall kernel body wrapper.
void forall_3D(int N, int X, int Y, int Z, lambda &&body)
void forall(int N, lambda &&body)
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.