12#ifndef MFEM_BILININTEG_DIFFUSION_KERNELS_HPP 
   13#define MFEM_BILININTEG_DIFFUSION_KERNELS_HPP 
   30void PADiffusionSetup(
const int dim,
 
   36                      const Array<real_t> &W,
 
   43void PADiffusionSetup2D(
const int Q1D,
 
   46                        const Array<real_t> &w,
 
   52void PADiffusionSetup3D(
const int Q1D,
 
   55                        const Array<real_t> &w,
 
   62void OccaPADiffusionSetup2D(
const int D1D,
 
   65                            const Array<real_t> &W,
 
   71void OccaPADiffusionSetup3D(
const int D1D,
 
   74                            const Array<real_t> &W,
 
   80void PADiffusionAssembleDiagonal(
const int dim,
 
   85                                 const Array<real_t> &B,
 
   86                                 const Array<real_t> &G,
 
   91template<
int T_D1D = 0, 
int T_Q1D = 0>
 
   92inline void PADiffusionDiagonal2D(
const int NE,
 
   94                                  const Array<real_t> &
b,
 
   95                                  const Array<real_t> &g,
 
  101   const int D1D = T_D1D ? T_D1D : d1d;
 
  102   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  105   auto B = 
Reshape(
b.Read(), Q1D, D1D);
 
  106   auto G = 
Reshape(g.Read(), Q1D, D1D);
 
  109   auto D = 
Reshape(d.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
 
  110   auto Y = 
Reshape(y.ReadWrite(), D1D, D1D, NE);
 
  113      const int D1D = T_D1D ? T_D1D : d1d;
 
  114      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  115      constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  116      constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  121      for (
int qx = 0; qx < Q1D; ++qx)
 
  123         for (
int dy = 0; dy < D1D; ++dy)
 
  128            for (
int qy = 0; qy < Q1D; ++qy)
 
  130               const int q = qx + qy * Q1D;
 
  131               const real_t D00 = D(q,0,e);
 
  132               const real_t D10 = D(q,1,e);
 
  133               const real_t D01 = symmetric ? D10 : D(q,2,e);
 
  134               const real_t D11 = symmetric ? D(q,2,e) : D(q,3,e);
 
  135               QD0[qx][dy] += B(qy, dy) * B(qy, dy) * D00;
 
  136               QD1[qx][dy] += B(qy, dy) * G(qy, dy) * (D01 + D10);
 
  137               QD2[qx][dy] += G(qy, dy) * G(qy, dy) * D11;
 
  141      for (
int dy = 0; dy < D1D; ++dy)
 
  143         for (
int dx = 0; dx < D1D; ++dx)
 
  145            for (
int qx = 0; qx < Q1D; ++qx)
 
  147               Y(dx,dy,e) += G(qx, dx) * G(qx, dx) * QD0[qx][dy];
 
  148               Y(dx,dy,e) += G(qx, dx) * B(qx, dx) * QD1[qx][dy];
 
  149               Y(dx,dy,e) += B(qx, dx) * B(qx, dx) * QD2[qx][dy];
 
  158constexpr int ipow(
int x, 
int p) { 
return p == 0 ? 1 : x*ipow(x, 
p-1); }
 
  159constexpr int D11(
int x) { 
return (11 - x)/2; }
 
  160constexpr int D10(
int x) { 
return (10 - x)/2; }
 
  161constexpr int NBZApply(
int D1D)
 
  163   return ipow(2, D11(D1D) >= 0 ? D11(D1D) : 0);
 
  165constexpr int NBZDiagonal(
int D1D)
 
  167   return ipow(2, D10(D1D) >= 0 ? D10(D1D) : 0);
 
  172template<
int T_D1D = 0, 
int T_Q1D = 0>
 
  173inline void SmemPADiffusionDiagonal2D(
const int NE,
 
  174                                      const bool symmetric,
 
  175                                      const Array<real_t> &b_,
 
  176                                      const Array<real_t> &g_,
 
  182   static constexpr int T_NBZ = diffusion::NBZDiagonal(T_D1D);
 
  183   static constexpr int NBZ = T_NBZ ? T_NBZ : 1;
 
  184   const int D1D = T_D1D ? T_D1D : d1d;
 
  185   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  188   MFEM_VERIFY(D1D <= max_d1d, 
"");
 
  189   MFEM_VERIFY(Q1D <= max_q1d, 
"");
 
  190   auto b = 
Reshape(b_.Read(), Q1D, D1D);
 
  191   auto g = 
Reshape(g_.Read(), Q1D, D1D);
 
  192   auto D = 
Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
 
  193   auto Y = 
Reshape(y_.ReadWrite(), D1D, D1D, NE);
 
  196      const int tidz = MFEM_THREAD_ID(z);
 
  197      const int D1D = T_D1D ? T_D1D : d1d;
 
  198      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  199      constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  200      constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  201      MFEM_SHARED 
real_t BG[2][MQ1*MD1];
 
  204      MFEM_SHARED 
real_t QD[3][NBZ][MD1][MQ1];
 
  210         MFEM_FOREACH_THREAD(d,y,D1D)
 
  212            MFEM_FOREACH_THREAD(q,x,Q1D)
 
  220      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  222         MFEM_FOREACH_THREAD(dy,y,D1D)
 
  227            for (
int qy = 0; qy < Q1D; ++qy)
 
  229               const int q = qx + qy * Q1D;
 
  230               const real_t D00 = D(q,0,e);
 
  231               const real_t D10 = D(q,1,e);
 
  232               const real_t D01 = symmetric ? D10 : D(q,2,e);
 
  233               const real_t D11 = symmetric ? D(q,2,e) : D(q,3,e);
 
  234               const real_t By = B[qy][dy];
 
  235               const real_t Gy = G[qy][dy];
 
  236               const real_t BBy = By * By;
 
  237               const real_t BGy = By * Gy;
 
  238               const real_t GGy = Gy * Gy;
 
  239               QD0[qx][dy] += BBy * D00;
 
  240               QD1[qx][dy] += BGy * (D01 + D10);
 
  241               QD2[qx][dy] += GGy * D11;
 
  246      MFEM_FOREACH_THREAD(dy,y,D1D)
 
  248         MFEM_FOREACH_THREAD(dx,x,D1D)
 
  250            for (
int qx = 0; qx < Q1D; ++qx)
 
  252               const real_t Bx = B[qx][dx];
 
  253               const real_t Gx = G[qx][dx];
 
  254               const real_t BBx = Bx * Bx;
 
  255               const real_t BGx = Bx * Gx;
 
  256               const real_t GGx = Gx * Gx;
 
  257               Y(dx,dy,e) += GGx * QD0[qx][dy];
 
  258               Y(dx,dy,e) += BGx * QD1[qx][dy];
 
  259               Y(dx,dy,e) += BBx * QD2[qx][dy];
 
  267template<
int T_D1D = 0, 
int T_Q1D = 0>
 
  268inline void PADiffusionDiagonal3D(
const int NE,
 
  269                                  const bool symmetric,
 
  270                                  const Array<real_t> &
b,
 
  271                                  const Array<real_t> &g,
 
  277   constexpr int DIM = 3;
 
  278   const int D1D = T_D1D ? T_D1D : d1d;
 
  279   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  282   MFEM_VERIFY(D1D <= max_d1d, 
"");
 
  283   MFEM_VERIFY(Q1D <= max_q1d, 
"");
 
  284   auto B = 
Reshape(
b.Read(), Q1D, D1D);
 
  285   auto G = 
Reshape(g.Read(), Q1D, D1D);
 
  286   auto Q = 
Reshape(d.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
 
  287   auto Y = 
Reshape(y.ReadWrite(), D1D, D1D, D1D, NE);
 
  290      const int D1D = T_D1D ? T_D1D : d1d;
 
  291      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  292      constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  293      constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  294      real_t QQD[MQ1][MQ1][MD1];
 
  295      real_t QDD[MQ1][MD1][MD1];
 
  296      for (
int i = 0; i < 
DIM; ++i)
 
  298         for (
int j = 0; j < 
DIM; ++j)
 
  301            for (
int qx = 0; qx < Q1D; ++qx)
 
  303               for (
int qy = 0; qy < Q1D; ++qy)
 
  305                  for (
int dz = 0; dz < D1D; ++dz)
 
  307                     QQD[qx][qy][dz] = 0.0;
 
  308                     for (
int qz = 0; qz < Q1D; ++qz)
 
  310                        const int q = qx + (qy + qz * Q1D) * Q1D;
 
  311                        const int ksym = j >= i ?
 
  312                                         3 - (3-i)*(2-i)/2 + j:
 
  313                                         3 - (3-j)*(2-j)/2 + i;
 
  314                        const int k = symmetric ? ksym : (i*
DIM) + j;
 
  315                        const real_t O = Q(q,k,e);
 
  316                        const real_t Bz = B(qz,dz);
 
  317                        const real_t Gz = G(qz,dz);
 
  318                        const real_t L = i==2 ? Gz : Bz;
 
  319                        const real_t R = j==2 ? Gz : Bz;
 
  320                        QQD[qx][qy][dz] += L * O * R;
 
  326            for (
int qx = 0; qx < Q1D; ++qx)
 
  328               for (
int dz = 0; dz < D1D; ++dz)
 
  330                  for (
int dy = 0; dy < D1D; ++dy)
 
  332                     QDD[qx][dy][dz] = 0.0;
 
  333                     for (
int qy = 0; qy < Q1D; ++qy)
 
  335                        const real_t By = B(qy,dy);
 
  336                        const real_t Gy = G(qy,dy);
 
  337                        const real_t L = i==1 ? Gy : By;
 
  338                        const real_t R = j==1 ? Gy : By;
 
  339                        QDD[qx][dy][dz] += L * QQD[qx][qy][dz] * R;
 
  345            for (
int dz = 0; dz < D1D; ++dz)
 
  347               for (
int dy = 0; dy < D1D; ++dy)
 
  349                  for (
int dx = 0; dx < D1D; ++dx)
 
  351                     for (
int qx = 0; qx < Q1D; ++qx)
 
  353                        const real_t Bx = B(qx,dx);
 
  354                        const real_t Gx = G(qx,dx);
 
  355                        const real_t L = i==0 ? Gx : Bx;
 
  356                        const real_t R = j==0 ? Gx : Bx;
 
  357                        Y(dx, dy, dz, e) += L * QDD[qx][dy][dz] * R;
 
  368template<
int T_D1D = 0, 
int T_Q1D = 0>
 
  369inline void SmemPADiffusionDiagonal3D(
const int NE,
 
  370                                      const bool symmetric,
 
  371                                      const Array<real_t> &b_,
 
  372                                      const Array<real_t> &g_,
 
  378   constexpr int DIM = 3;
 
  379   const int D1D = T_D1D ? T_D1D : d1d;
 
  380   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  383   MFEM_VERIFY(D1D <= max_d1d, 
"");
 
  384   MFEM_VERIFY(Q1D <= max_q1d, 
"");
 
  385   auto b = 
Reshape(b_.Read(), Q1D, D1D);
 
  386   auto g = 
Reshape(g_.Read(), Q1D, D1D);
 
  387   auto D = 
Reshape(d_.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
 
  388   auto Y = 
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
 
  391      const int tidz = MFEM_THREAD_ID(z);
 
  392      const int D1D = T_D1D ? T_D1D : d1d;
 
  393      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  394      constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  395      constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  396      MFEM_SHARED 
real_t BG[2][MQ1*MD1];
 
  399      MFEM_SHARED 
real_t QQD[MQ1][MQ1][MD1];
 
  400      MFEM_SHARED 
real_t QDD[MQ1][MD1][MD1];
 
  403         MFEM_FOREACH_THREAD(d,y,D1D)
 
  405            MFEM_FOREACH_THREAD(q,x,Q1D)
 
  413      for (
int i = 0; i < 
DIM; ++i)
 
  415         for (
int j = 0; j < 
DIM; ++j)
 
  418            MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  420               MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  422                  MFEM_FOREACH_THREAD(dz,z,D1D)
 
  424                     QQD[qx][qy][dz] = 0.0;
 
  425                     for (
int qz = 0; qz < Q1D; ++qz)
 
  427                        const int q = qx + (qy + qz * Q1D) * Q1D;
 
  428                        const int ksym = j >= i ?
 
  429                                         3 - (3-i)*(2-i)/2 + j:
 
  430                                         3 - (3-j)*(2-j)/2 + i;
 
  431                        const int k = symmetric ? ksym : (i*
DIM) + j;
 
  432                        const real_t O = D(q,k,e);
 
  433                        const real_t Bz = B[qz][dz];
 
  434                        const real_t Gz = G[qz][dz];
 
  435                        const real_t L = i==2 ? Gz : Bz;
 
  436                        const real_t R = j==2 ? Gz : Bz;
 
  437                        QQD[qx][qy][dz] += L * O * R;
 
  444            MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  446               MFEM_FOREACH_THREAD(dz,z,D1D)
 
  448                  MFEM_FOREACH_THREAD(dy,y,D1D)
 
  450                     QDD[qx][dy][dz] = 0.0;
 
  451                     for (
int qy = 0; qy < Q1D; ++qy)
 
  453                        const real_t By = B[qy][dy];
 
  454                        const real_t Gy = G[qy][dy];
 
  455                        const real_t L = i==1 ? Gy : By;
 
  456                        const real_t R = j==1 ? Gy : By;
 
  457                        QDD[qx][dy][dz] += L * QQD[qx][qy][dz] * R;
 
  464            MFEM_FOREACH_THREAD(dz,z,D1D)
 
  466               MFEM_FOREACH_THREAD(dy,y,D1D)
 
  468                  MFEM_FOREACH_THREAD(dx,x,D1D)
 
  470                     for (
int qx = 0; qx < Q1D; ++qx)
 
  472                        const real_t Bx = B[qx][dx];
 
  473                        const real_t Gx = G[qx][dx];
 
  474                        const real_t L = i==0 ? Gx : Bx;
 
  475                        const real_t R = j==0 ? Gx : Bx;
 
  476                        Y(dx, dy, dz, e) += L * QDD[qx][dy][dz] * R;
 
  486void PADiffusionApply(
const int dim,
 
  491                      const Array<real_t> &B,
 
  492                      const Array<real_t> &G,
 
  493                      const Array<real_t> &Bt,
 
  494                      const Array<real_t> &Gt,
 
  501void OccaPADiffusionApply2D(
const int D1D,
 
  504                            const Array<real_t> &B,
 
  505                            const Array<real_t> &G,
 
  506                            const Array<real_t> &Bt,
 
  507                            const Array<real_t> &Gt,
 
  513void OccaPADiffusionApply3D(
const int D1D,
 
  516                            const Array<real_t> &B,
 
  517                            const Array<real_t> &G,
 
  518                            const Array<real_t> &Bt,
 
  519                            const Array<real_t> &Gt,
 
  526template<
int T_D1D = 0, 
int T_Q1D = 0>
 
  527inline void PADiffusionApply2D(
const int NE,
 
  528                               const bool symmetric,
 
  529                               const Array<real_t> &b_,
 
  530                               const Array<real_t> &g_,
 
  531                               const Array<real_t> &bt_,
 
  532                               const Array<real_t> >_,
 
  539   const int D1D = T_D1D ? T_D1D : d1d;
 
  540   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  543   auto B = 
Reshape(b_.Read(), Q1D, D1D);
 
  544   auto G = 
Reshape(g_.Read(), Q1D, D1D);
 
  545   auto Bt = 
Reshape(bt_.Read(), D1D, Q1D);
 
  546   auto Gt = 
Reshape(gt_.Read(), D1D, Q1D);
 
  547   auto D = 
Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
 
  548   auto X = 
Reshape(x_.Read(), D1D, D1D, NE);
 
  549   auto Y = 
Reshape(y_.ReadWrite(), D1D, D1D, NE);
 
  552      const int D1D = T_D1D ? T_D1D : d1d;
 
  553      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  555      constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  556      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  558      real_t grad[max_Q1D][max_Q1D][2];
 
  559      for (
int qy = 0; qy < Q1D; ++qy)
 
  561         for (
int qx = 0; qx < Q1D; ++qx)
 
  563            grad[qy][qx][0] = 0.0;
 
  564            grad[qy][qx][1] = 0.0;
 
  567      for (
int dy = 0; dy < D1D; ++dy)
 
  570         for (
int qx = 0; qx < Q1D; ++qx)
 
  575         for (
int dx = 0; dx < D1D; ++dx)
 
  577            const real_t s = X(dx,dy,e);
 
  578            for (
int qx = 0; qx < Q1D; ++qx)
 
  580               gradX[qx][0] += s * B(qx,dx);
 
  581               gradX[qx][1] += s * G(qx,dx);
 
  584         for (
int qy = 0; qy < Q1D; ++qy)
 
  586            const real_t wy  = B(qy,dy);
 
  587            const real_t wDy = G(qy,dy);
 
  588            for (
int qx = 0; qx < Q1D; ++qx)
 
  590               grad[qy][qx][0] += gradX[qx][1] * wy;
 
  591               grad[qy][qx][1] += gradX[qx][0] * wDy;
 
  596      for (
int qy = 0; qy < Q1D; ++qy)
 
  598         for (
int qx = 0; qx < Q1D; ++qx)
 
  600            const int q = qx + qy * Q1D;
 
  602            const real_t O11 = D(q,0,e);
 
  603            const real_t O21 = D(q,1,e);
 
  604            const real_t O12 = symmetric ? O21 : D(q,2,e);
 
  605            const real_t O22 = symmetric ? D(q,2,e) : D(q,3,e);
 
  607            const real_t gradX = grad[qy][qx][0];
 
  608            const real_t gradY = grad[qy][qx][1];
 
  610            grad[qy][qx][0] = (O11 * gradX) + (O12 * gradY);
 
  611            grad[qy][qx][1] = (O21 * gradX) + (O22 * gradY);
 
  614      for (
int qy = 0; qy < Q1D; ++qy)
 
  617         for (
int dx = 0; dx < D1D; ++dx)
 
  622         for (
int qx = 0; qx < Q1D; ++qx)
 
  624            const real_t gX = grad[qy][qx][0];
 
  625            const real_t gY = grad[qy][qx][1];
 
  626            for (
int dx = 0; dx < D1D; ++dx)
 
  628               const real_t wx  = Bt(dx,qx);
 
  629               const real_t wDx = Gt(dx,qx);
 
  630               gradX[dx][0] += gX * wDx;
 
  631               gradX[dx][1] += gY * wx;
 
  634         for (
int dy = 0; dy < D1D; ++dy)
 
  636            const real_t wy  = Bt(dy,qy);
 
  637            const real_t wDy = Gt(dy,qy);
 
  638            for (
int dx = 0; dx < D1D; ++dx)
 
  640               Y(dx,dy,e) += ((gradX[dx][0] * wy) + (gradX[dx][1] * wDy));
 
  648template<
int T_D1D = 0, 
int T_Q1D = 0>
 
  649inline void SmemPADiffusionApply2D(
const int NE,
 
  650                                   const bool symmetric,
 
  651                                   const Array<real_t> &b_,
 
  652                                   const Array<real_t> &g_,
 
  653                                   const Array<real_t> &bt_,
 
  654                                   const Array<real_t> >_,
 
  661   static constexpr int T_NBZ = diffusion::NBZApply(T_D1D);
 
  662   static constexpr int NBZ = T_NBZ ? T_NBZ : 1;
 
  663   const int D1D = T_D1D ? T_D1D : d1d;
 
  664   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  667   MFEM_VERIFY(D1D <= max_d1d, 
"");
 
  668   MFEM_VERIFY(Q1D <= max_q1d, 
"");
 
  669   auto b = 
Reshape(b_.Read(), Q1D, D1D);
 
  670   auto g = 
Reshape(g_.Read(), Q1D, D1D);
 
  671   auto D = 
Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
 
  672   auto x = 
Reshape(x_.Read(), D1D, D1D, NE);
 
  673   auto Y = 
Reshape(y_.ReadWrite(), D1D, D1D, NE);
 
  676      const int tidz = MFEM_THREAD_ID(z);
 
  677      const int D1D = T_D1D ? T_D1D : d1d;
 
  678      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  679      constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  680      constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  681      MFEM_SHARED 
real_t sBG[2][MQ1*MD1];
 
  686      MFEM_SHARED 
real_t Xz[NBZ][MD1][MD1];
 
  687      MFEM_SHARED 
real_t GD[2][NBZ][MD1][MQ1];
 
  688      MFEM_SHARED 
real_t GQ[2][NBZ][MD1][MQ1];
 
  694      MFEM_FOREACH_THREAD(dy,y,D1D)
 
  696         MFEM_FOREACH_THREAD(dx,x,D1D)
 
  698            X[dy][dx] = x(dx,dy,e);
 
  703         MFEM_FOREACH_THREAD(dy,y,D1D)
 
  705            MFEM_FOREACH_THREAD(q,x,Q1D)
 
  713      MFEM_FOREACH_THREAD(dy,y,D1D)
 
  715         MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  719            for (
int dx = 0; dx < D1D; ++dx)
 
  721               const real_t coords = X[dy][dx];
 
  722               u += B[qx][dx] * coords;
 
  723               v += G[qx][dx] * coords;
 
  730      MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  732         MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  736            for (
int dy = 0; dy < D1D; ++dy)
 
  738               u += DQ1[dy][qx] * B[qy][dy];
 
  739               v += DQ0[dy][qx] * G[qy][dy];
 
  746      MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  748         MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  750            const int q = (qx + ((qy) * Q1D));
 
  751            const real_t O11 = D(q,0,e);
 
  752            const real_t O21 = D(q,1,e);
 
  753            const real_t O12 = symmetric ? O21 : D(q,2,e);
 
  754            const real_t O22 = symmetric ? D(q,2,e) : D(q,3,e);
 
  755            const real_t gX = QQ0[qy][qx];
 
  756            const real_t gY = QQ1[qy][qx];
 
  757            QQ0[qy][qx] = (O11 * gX) + (O12 * gY);
 
  758            QQ1[qy][qx] = (O21 * gX) + (O22 * gY);
 
  764         MFEM_FOREACH_THREAD(dy,y,D1D)
 
  766            MFEM_FOREACH_THREAD(q,x,Q1D)
 
  774      MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  776         MFEM_FOREACH_THREAD(dx,x,D1D)
 
  780            for (
int qx = 0; qx < Q1D; ++qx)
 
  782               u += Gt[dx][qx] * QQ0[qy][qx];
 
  783               v += Bt[dx][qx] * QQ1[qy][qx];
 
  790      MFEM_FOREACH_THREAD(dy,y,D1D)
 
  792         MFEM_FOREACH_THREAD(dx,x,D1D)
 
  796            for (
int qy = 0; qy < Q1D; ++qy)
 
  798               u += DQ0[qy][dx] * Bt[dy][qy];
 
  799               v += DQ1[qy][dx] * Gt[dy][qy];
 
  801            Y(dx,dy,e) += (
u + v);
 
  808template<
int T_D1D = 0, 
int T_Q1D = 0>
 
  809inline void PADiffusionApply3D(
const int NE,
 
  810                               const bool symmetric,
 
  811                               const Array<real_t> &
b,
 
  812                               const Array<real_t> &g,
 
  813                               const Array<real_t> &bt,
 
  814                               const Array<real_t> >,
 
  818                               int d1d = 0, 
int q1d = 0)
 
  820   const int D1D = T_D1D ? T_D1D : d1d;
 
  821   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  824   auto B = 
Reshape(
b.Read(), Q1D, D1D);
 
  825   auto G = 
Reshape(g.Read(), Q1D, D1D);
 
  826   auto Bt = 
Reshape(bt.Read(), D1D, Q1D);
 
  827   auto Gt = 
Reshape(gt.Read(), D1D, Q1D);
 
  828   auto D = 
Reshape(d_.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
 
  829   auto X = 
Reshape(x_.Read(), D1D, D1D, D1D, NE);
 
  830   auto Y = 
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
 
  833      const int D1D = T_D1D ? T_D1D : d1d;
 
  834      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  835      constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  836      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  837      real_t grad[max_Q1D][max_Q1D][max_Q1D][3];
 
  838      for (
int qz = 0; qz < Q1D; ++qz)
 
  840         for (
int qy = 0; qy < Q1D; ++qy)
 
  842            for (
int qx = 0; qx < Q1D; ++qx)
 
  844               grad[qz][qy][qx][0] = 0.0;
 
  845               grad[qz][qy][qx][1] = 0.0;
 
  846               grad[qz][qy][qx][2] = 0.0;
 
  850      for (
int dz = 0; dz < D1D; ++dz)
 
  852         real_t gradXY[max_Q1D][max_Q1D][3];
 
  853         for (
int qy = 0; qy < Q1D; ++qy)
 
  855            for (
int qx = 0; qx < Q1D; ++qx)
 
  857               gradXY[qy][qx][0] = 0.0;
 
  858               gradXY[qy][qx][1] = 0.0;
 
  859               gradXY[qy][qx][2] = 0.0;
 
  862         for (
int dy = 0; dy < D1D; ++dy)
 
  865            for (
int qx = 0; qx < Q1D; ++qx)
 
  870            for (
int dx = 0; dx < D1D; ++dx)
 
  872               const real_t s = X(dx,dy,dz,e);
 
  873               for (
int qx = 0; qx < Q1D; ++qx)
 
  875                  gradX[qx][0] += s * B(qx,dx);
 
  876                  gradX[qx][1] += s * G(qx,dx);
 
  879            for (
int qy = 0; qy < Q1D; ++qy)
 
  881               const real_t wy  = B(qy,dy);
 
  882               const real_t wDy = G(qy,dy);
 
  883               for (
int qx = 0; qx < Q1D; ++qx)
 
  885                  const real_t wx  = gradX[qx][0];
 
  886                  const real_t wDx = gradX[qx][1];
 
  887                  gradXY[qy][qx][0] += wDx * wy;
 
  888                  gradXY[qy][qx][1] += wx  * wDy;
 
  889                  gradXY[qy][qx][2] += wx  * wy;
 
  893         for (
int qz = 0; qz < Q1D; ++qz)
 
  895            const real_t wz  = B(qz,dz);
 
  896            const real_t wDz = G(qz,dz);
 
  897            for (
int qy = 0; qy < Q1D; ++qy)
 
  899               for (
int qx = 0; qx < Q1D; ++qx)
 
  901                  grad[qz][qy][qx][0] += gradXY[qy][qx][0] * wz;
 
  902                  grad[qz][qy][qx][1] += gradXY[qy][qx][1] * wz;
 
  903                  grad[qz][qy][qx][2] += gradXY[qy][qx][2] * wDz;
 
  909      for (
int qz = 0; qz < Q1D; ++qz)
 
  911         for (
int qy = 0; qy < Q1D; ++qy)
 
  913            for (
int qx = 0; qx < Q1D; ++qx)
 
  915               const int q = qx + (qy + qz * Q1D) * Q1D;
 
  916               const real_t O11 = D(q,0,e);
 
  917               const real_t O12 = D(q,1,e);
 
  918               const real_t O13 = D(q,2,e);
 
  919               const real_t O21 = symmetric ? O12 : D(q,3,e);
 
  920               const real_t O22 = symmetric ? D(q,3,e) : D(q,4,e);
 
  921               const real_t O23 = symmetric ? D(q,4,e) : D(q,5,e);
 
  922               const real_t O31 = symmetric ? O13 : D(q,6,e);
 
  923               const real_t O32 = symmetric ? O23 : D(q,7,e);
 
  924               const real_t O33 = symmetric ? D(q,5,e) : D(q,8,e);
 
  925               const real_t gradX = grad[qz][qy][qx][0];
 
  926               const real_t gradY = grad[qz][qy][qx][1];
 
  927               const real_t gradZ = grad[qz][qy][qx][2];
 
  928               grad[qz][qy][qx][0] = (O11*gradX)+(O12*gradY)+(O13*gradZ);
 
  929               grad[qz][qy][qx][1] = (O21*gradX)+(O22*gradY)+(O23*gradZ);
 
  930               grad[qz][qy][qx][2] = (O31*gradX)+(O32*gradY)+(O33*gradZ);
 
  934      for (
int qz = 0; qz < Q1D; ++qz)
 
  936         real_t gradXY[max_D1D][max_D1D][3];
 
  937         for (
int dy = 0; dy < D1D; ++dy)
 
  939            for (
int dx = 0; dx < D1D; ++dx)
 
  941               gradXY[dy][dx][0] = 0;
 
  942               gradXY[dy][dx][1] = 0;
 
  943               gradXY[dy][dx][2] = 0;
 
  946         for (
int qy = 0; qy < Q1D; ++qy)
 
  949            for (
int dx = 0; dx < D1D; ++dx)
 
  955            for (
int qx = 0; qx < Q1D; ++qx)
 
  957               const real_t gX = grad[qz][qy][qx][0];
 
  958               const real_t gY = grad[qz][qy][qx][1];
 
  959               const real_t gZ = grad[qz][qy][qx][2];
 
  960               for (
int dx = 0; dx < D1D; ++dx)
 
  962                  const real_t wx  = Bt(dx,qx);
 
  963                  const real_t wDx = Gt(dx,qx);
 
  964                  gradX[dx][0] += gX * wDx;
 
  965                  gradX[dx][1] += gY * wx;
 
  966                  gradX[dx][2] += gZ * wx;
 
  969            for (
int dy = 0; dy < D1D; ++dy)
 
  971               const real_t wy  = Bt(dy,qy);
 
  972               const real_t wDy = Gt(dy,qy);
 
  973               for (
int dx = 0; dx < D1D; ++dx)
 
  975                  gradXY[dy][dx][0] += gradX[dx][0] * wy;
 
  976                  gradXY[dy][dx][1] += gradX[dx][1] * wDy;
 
  977                  gradXY[dy][dx][2] += gradX[dx][2] * wy;
 
  981         for (
int dz = 0; dz < D1D; ++dz)
 
  983            const real_t wz  = Bt(dz,qz);
 
  984            const real_t wDz = Gt(dz,qz);
 
  985            for (
int dy = 0; dy < D1D; ++dy)
 
  987               for (
int dx = 0; dx < D1D; ++dx)
 
  990                     ((gradXY[dy][dx][0] * wz) +
 
  991                      (gradXY[dy][dx][1] * wz) +
 
  992                      (gradXY[dy][dx][2] * wDz));
 
 1001template<
int T_D1D = 0, 
int T_Q1D = 0>
 
 1002inline void SmemPADiffusionApply3D(
const int NE,
 
 1003                                   const bool symmetric,
 
 1004                                   const Array<real_t> &b_,
 
 1005                                   const Array<real_t> &g_,
 
 1006                                   const Array<real_t> &,
 
 1007                                   const Array<real_t> &,
 
 1014   const int D1D = T_D1D ? T_D1D : d1d;
 
 1015   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 1018   MFEM_VERIFY(D1D <= max_d1d, 
"");
 
 1019   MFEM_VERIFY(Q1D <= max_q1d, 
"");
 
 1020   auto b = 
Reshape(b_.Read(), Q1D, D1D);
 
 1021   auto g = 
Reshape(g_.Read(), Q1D, D1D);
 
 1022   auto d = 
Reshape(d_.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
 
 1023   auto x = 
Reshape(x_.Read(), D1D, D1D, D1D, NE);
 
 1024   auto y = 
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
 
 1027      const int D1D = T_D1D ? T_D1D : d1d;
 
 1028      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
 1029      constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
 1030      constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
 1031      constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
 
 1032      MFEM_SHARED 
real_t sBG[2][MQ1*MD1];
 
 1037      MFEM_SHARED 
real_t sm0[3][MDQ*MDQ*MDQ];
 
 1038      MFEM_SHARED 
real_t sm1[3][MDQ*MDQ*MDQ];
 
 1039      real_t (*X)[MD1][MD1]    = (
real_t (*)[MD1][MD1]) (sm0+2);
 
 1040      real_t (*DDQ0)[MD1][MQ1] = (
real_t (*)[MD1][MQ1]) (sm0+0);
 
 1041      real_t (*DDQ1)[MD1][MQ1] = (
real_t (*)[MD1][MQ1]) (sm0+1);
 
 1042      real_t (*DQQ0)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm1+0);
 
 1043      real_t (*DQQ1)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm1+1);
 
 1044      real_t (*DQQ2)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm1+2);
 
 1045      real_t (*QQQ0)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm0+0);
 
 1046      real_t (*QQQ1)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm0+1);
 
 1047      real_t (*QQQ2)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm0+2);
 
 1048      real_t (*QQD0)[MQ1][MD1] = (
real_t (*)[MQ1][MD1]) (sm1+0);
 
 1049      real_t (*QQD1)[MQ1][MD1] = (
real_t (*)[MQ1][MD1]) (sm1+1);
 
 1050      real_t (*QQD2)[MQ1][MD1] = (
real_t (*)[MQ1][MD1]) (sm1+2);
 
 1051      real_t (*QDD0)[MD1][MD1] = (
real_t (*)[MD1][MD1]) (sm0+0);
 
 1052      real_t (*QDD1)[MD1][MD1] = (
real_t (*)[MD1][MD1]) (sm0+1);
 
 1053      real_t (*QDD2)[MD1][MD1] = (
real_t (*)[MD1][MD1]) (sm0+2);
 
 1054      MFEM_FOREACH_THREAD(dz,z,D1D)
 
 1056         MFEM_FOREACH_THREAD(dy,y,D1D)
 
 1058            MFEM_FOREACH_THREAD(dx,x,D1D)
 
 1060               X[dz][dy][dx] = x(dx,dy,dz,e);
 
 1064      if (MFEM_THREAD_ID(z) == 0)
 
 1066         MFEM_FOREACH_THREAD(dy,y,D1D)
 
 1068            MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 1070               B[qx][dy] = 
b(qx,dy);
 
 1071               G[qx][dy] = g(qx,dy);
 
 1076      MFEM_FOREACH_THREAD(dz,z,D1D)
 
 1078         MFEM_FOREACH_THREAD(dy,y,D1D)
 
 1080            MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 1084               for (
int dx = 0; dx < D1D; ++dx)
 
 1086                  const real_t coords = X[dz][dy][dx];
 
 1087                  u += coords * B[qx][dx];
 
 1088                  v += coords * G[qx][dx];
 
 1090               DDQ0[dz][dy][qx] = 
u;
 
 1091               DDQ1[dz][dy][qx] = v;
 
 1096      MFEM_FOREACH_THREAD(dz,z,D1D)
 
 1098         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 1100            MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 1102               real_t u = 0.0, v = 0.0, w = 0.0;
 
 1104               for (
int dy = 0; dy < D1D; ++dy)
 
 1106                  u += DDQ1[dz][dy][qx] * B[qy][dy];
 
 1107                  v += DDQ0[dz][dy][qx] * G[qy][dy];
 
 1108                  w += DDQ0[dz][dy][qx] * B[qy][dy];
 
 1110               DQQ0[dz][qy][qx] = 
u;
 
 1111               DQQ1[dz][qy][qx] = v;
 
 1112               DQQ2[dz][qy][qx] = w;
 
 1117      MFEM_FOREACH_THREAD(qz,z,Q1D)
 
 1119         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 1121            MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 1123               real_t u = 0.0, v = 0.0, w = 0.0;
 
 1125               for (
int dz = 0; dz < D1D; ++dz)
 
 1127                  u += DQQ0[dz][qy][qx] * B[qz][dz];
 
 1128                  v += DQQ1[dz][qy][qx] * B[qz][dz];
 
 1129                  w += DQQ2[dz][qy][qx] * G[qz][dz];
 
 1131               const real_t O11 = d(qx,qy,qz,0,e);
 
 1132               const real_t O12 = d(qx,qy,qz,1,e);
 
 1133               const real_t O13 = d(qx,qy,qz,2,e);
 
 1134               const real_t O21 = symmetric ? O12 : d(qx,qy,qz,3,e);
 
 1135               const real_t O22 = symmetric ? d(qx,qy,qz,3,e) : d(qx,qy,qz,4,e);
 
 1136               const real_t O23 = symmetric ? d(qx,qy,qz,4,e) : d(qx,qy,qz,5,e);
 
 1137               const real_t O31 = symmetric ? O13 : d(qx,qy,qz,6,e);
 
 1138               const real_t O32 = symmetric ? O23 : d(qx,qy,qz,7,e);
 
 1139               const real_t O33 = symmetric ? d(qx,qy,qz,5,e) : d(qx,qy,qz,8,e);
 
 1143               QQQ0[qz][qy][qx] = (O11*gX) + (O12*gY) + (O13*gZ);
 
 1144               QQQ1[qz][qy][qx] = (O21*gX) + (O22*gY) + (O23*gZ);
 
 1145               QQQ2[qz][qy][qx] = (O31*gX) + (O32*gY) + (O33*gZ);
 
 1150      if (MFEM_THREAD_ID(z) == 0)
 
 1152         MFEM_FOREACH_THREAD(dy,y,D1D)
 
 1154            MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 1156               Bt[dy][qx] = 
b(qx,dy);
 
 1157               Gt[dy][qx] = g(qx,dy);
 
 1162      MFEM_FOREACH_THREAD(qz,z,Q1D)
 
 1164         MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 1166            MFEM_FOREACH_THREAD(dx,x,D1D)
 
 1168               real_t u = 0.0, v = 0.0, w = 0.0;
 
 1170               for (
int qx = 0; qx < Q1D; ++qx)
 
 1172                  u += QQQ0[qz][qy][qx] * Gt[dx][qx];
 
 1173                  v += QQQ1[qz][qy][qx] * Bt[dx][qx];
 
 1174                  w += QQQ2[qz][qy][qx] * Bt[dx][qx];
 
 1176               QQD0[qz][qy][dx] = 
u;
 
 1177               QQD1[qz][qy][dx] = v;
 
 1178               QQD2[qz][qy][dx] = w;
 
 1183      MFEM_FOREACH_THREAD(qz,z,Q1D)
 
 1185         MFEM_FOREACH_THREAD(dy,y,D1D)
 
 1187            MFEM_FOREACH_THREAD(dx,x,D1D)
 
 1189               real_t u = 0.0, v = 0.0, w = 0.0;
 
 1191               for (
int qy = 0; qy < Q1D; ++qy)
 
 1193                  u += QQD0[qz][qy][dx] * Bt[dy][qy];
 
 1194                  v += QQD1[qz][qy][dx] * Gt[dy][qy];
 
 1195                  w += QQD2[qz][qy][dx] * Bt[dy][qy];
 
 1197               QDD0[qz][dy][dx] = 
u;
 
 1198               QDD1[qz][dy][dx] = v;
 
 1199               QDD2[qz][dy][dx] = w;
 
 1204      MFEM_FOREACH_THREAD(dz,z,D1D)
 
 1206         MFEM_FOREACH_THREAD(dy,y,D1D)
 
 1208            MFEM_FOREACH_THREAD(dx,x,D1D)
 
 1210               real_t u = 0.0, v = 0.0, w = 0.0;
 
 1212               for (
int qz = 0; qz < Q1D; ++qz)
 
 1214                  u += QDD0[qz][dy][dx] * Bt[dz][qz];
 
 1215                  v += QDD1[qz][dy][dx] * Bt[dz][qz];
 
 1216                  w += QDD2[qz][dy][dx] * Gt[dz][qz];
 
 1218               y(dx,dy,dz,e) += (
u + v + w);
 
 1233template<
int DIM, 
int T_D1D, 
int T_Q1D>
 
 1234ApplyKernelType DiffusionIntegrator::ApplyPAKernels::Kernel()
 
 1236   if (
DIM == 2) { 
return internal::SmemPADiffusionApply2D<T_D1D,T_Q1D>; }
 
 1237   else if (
DIM == 3) { 
return internal::SmemPADiffusionApply3D<T_D1D, T_Q1D>; }
 
 1238   else { MFEM_ABORT(
""); }
 
 1242ApplyKernelType DiffusionIntegrator::ApplyPAKernels::Fallback(
int DIM, 
int, 
int)
 
 1244   if (
DIM == 2) { 
return internal::PADiffusionApply2D; }
 
 1245   else if (
DIM == 3) { 
return internal::PADiffusionApply3D; }
 
 1246   else { MFEM_ABORT(
""); }
 
 1249template<
int DIM, 
int D1D, 
int Q1D>
 
 1250DiagonalKernelType DiffusionIntegrator::DiagonalPAKernels::Kernel()
 
 1252   if (
DIM == 2) { 
return internal::SmemPADiffusionDiagonal2D<D1D,Q1D>; }
 
 1253   else if (
DIM == 3) { 
return internal::SmemPADiffusionDiagonal3D<D1D, Q1D>; }
 
 1254   else { MFEM_ABORT(
""); }
 
 1257inline DiagonalKernelType
 
 1258DiffusionIntegrator::DiagonalPAKernels::Fallback(
int DIM, 
int, 
int)
 
 1260   if (
DIM == 2) { 
return internal::PADiffusionDiagonal2D; }
 
 1261   else if (
DIM == 3) { 
return internal::PADiffusionDiagonal3D; }
 
 1262   else { MFEM_ABORT(
""); }
 
void(*)(const int, const bool, const Array< real_t > &, const Array< real_t > &, const Vector &, Vector &, const int, const int) DiagonalKernelType
void(*)(const int, const bool, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Vector &, const Vector &, Vector &, const int, const int) ApplyKernelType
real_t u(const Vector &xvec)
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
void forall_3D(int N, int X, int Y, int Z, lambda &&body)
void forall(int N, lambda &&body)
real_t p(const Vector &x, real_t t)
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.
int MAX_D1D
Maximum number of 1D nodal points.
int MAX_Q1D
Maximum number of 1D quadrature points.