12#ifndef MFEM_FEM_KERNELS_HPP 
   13#define MFEM_FEM_KERNELS_HPP 
   30template<
int MD1, 
int MQ1>
 
   31MFEM_HOST_DEVICE 
inline void LoadB(
const int D1D, 
const int Q1D,
 
   35   const int tidz = MFEM_THREAD_ID(z);
 
   40      MFEM_FOREACH_THREAD(d,y,D1D)
 
   42         MFEM_FOREACH_THREAD(q,x,Q1D)
 
   52template<
int MD1, 
int MQ1>
 
   53MFEM_HOST_DEVICE 
inline void LoadBt(
const int D1D, 
const int Q1D,
 
   57   const int tidz = MFEM_THREAD_ID(z);
 
   62      MFEM_FOREACH_THREAD(d,y,D1D)
 
   64         MFEM_FOREACH_THREAD(q,x,Q1D)
 
   74template<
int MD1, 
int MQ1>
 
   75MFEM_HOST_DEVICE 
inline void LoadBG(
const int D1D, 
const int Q1D,
 
   80   const int tidz = MFEM_THREAD_ID(z);
 
   86      MFEM_FOREACH_THREAD(d,y,D1D)
 
   88         MFEM_FOREACH_THREAD(q,x,Q1D)
 
   99template<
int MD1, 
int MQ1>
 
  100MFEM_HOST_DEVICE 
inline void LoadBGt(
const int D1D, 
const int Q1D,
 
  103                                     real_t (&sBG)[2][MQ1*MD1])
 
  105   const int tidz = MFEM_THREAD_ID(z);
 
  111      MFEM_FOREACH_THREAD(d,y,D1D)
 
  113         MFEM_FOREACH_THREAD(q,x,Q1D)
 
  124MFEM_HOST_DEVICE 
inline void LoadX(
const int e, 
const int D1D,
 
  125                                   const DeviceTensor<3, const real_t> &x,
 
  128   MFEM_FOREACH_THREAD(dy,y,D1D)
 
  130      MFEM_FOREACH_THREAD(dx,x,D1D)
 
  132         DD(dx,dy) = x(dx,dy,e);
 
  140template<
int MD1, 
int NBZ>
 
  141MFEM_HOST_DEVICE 
inline void LoadX(
const int e, 
const int D1D,
 
  142                                   const DeviceTensor<3, const real_t> &x,
 
  143                                   real_t (&sX)[NBZ][MD1*MD1])
 
  145   const int tidz = MFEM_THREAD_ID(z);
 
  151MFEM_HOST_DEVICE 
inline void LoadX(
const int e, 
const int D1D, 
const int c,
 
  152                                   const DeviceTensor<4, const real_t> &x,
 
  155   MFEM_FOREACH_THREAD(dy,y,D1D)
 
  157      MFEM_FOREACH_THREAD(dx,x,D1D)
 
  159         DD(dx,dy) = x(dx,dy,c,e);
 
  165template<
int MD1, 
int NBZ>
 
  166MFEM_HOST_DEVICE 
inline void LoadX(
const int e, 
const int D1D, 
const int c,
 
  167                                   const DeviceTensor<4, const real_t> &x,
 
  168                                   real_t (&sm)[NBZ][MD1*MD1])
 
  170   const int tidz = MFEM_THREAD_ID(z);
 
  176MFEM_HOST_DEVICE 
inline void EvalX(
const int D1D, 
const int Q1D,
 
  181   MFEM_FOREACH_THREAD(dy,y,D1D)
 
  183      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  186         for (
int dx = 0; dx < D1D; ++dx)
 
  188            u += B(dx,qx) * DD(dx,dy);
 
  196template<
int MD1, 
int MQ1, 
int NBZ>
 
  197MFEM_HOST_DEVICE 
inline void EvalX(
const int D1D, 
const int Q1D,
 
  198                                   const real_t (&sB)[MQ1*MD1],
 
  199                                   real_t (&sDD)[NBZ][MD1*MD1],
 
  200                                   real_t (&sDQ)[NBZ][MD1*MQ1])
 
  202   const int tidz = MFEM_THREAD_ID(z);
 
  206   EvalX(D1D,Q1D,B,DD,DQ);
 
  210MFEM_HOST_DEVICE 
inline void EvalY(
const int D1D, 
const int Q1D,
 
  215   MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  217      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  220         for (
int dy = 0; dy < D1D; ++dy)
 
  222            u += DQ(dy,qx) * B(dy,qy);
 
  230template<
int MD1, 
int MQ1, 
int NBZ>
 
  231MFEM_HOST_DEVICE 
inline void EvalY(
const int D1D, 
const int Q1D,
 
  232                                   const real_t (&sB)[MQ1*MD1],
 
  233                                   real_t (&sDQ)[NBZ][MD1*MQ1],
 
  234                                   real_t (&sQQ)[NBZ][MQ1*MQ1])
 
  236   const int tidz = MFEM_THREAD_ID(z);
 
  240   EvalY(D1D,Q1D,B,DQ,QQ);
 
  244MFEM_HOST_DEVICE 
inline void PullEval(
const int qx, 
const int qy,
 
  251template<
int MQ1, 
int NBZ>
 
  252MFEM_HOST_DEVICE 
inline void PullEval(
const int Q1D,
 
  253                                      const int qx, 
const int qy,
 
  254                                      real_t (&sQQ)[NBZ][MQ1*MQ1],
 
  257   const int tidz = MFEM_THREAD_ID(z);
 
  259   PullEval(qx,qy,QQ,P);
 
  263template<
int MD1, 
int NBZ>
 
  264MFEM_HOST_DEVICE 
inline void LoadX(
const int e, 
const int D1D,
 
  265                                   const DeviceTensor<4, const real_t> &X,
 
  266                                   real_t (&sX)[2][NBZ][MD1*MD1])
 
  268   const int tidz = MFEM_THREAD_ID(z);
 
  272   MFEM_FOREACH_THREAD(dy,y,D1D)
 
  274      MFEM_FOREACH_THREAD(dx,x,D1D)
 
  276         X0(dx,dy) = X(dx,dy,0,e);
 
  277         X1(dx,dy) = X(dx,dy,1,e);
 
  284template<
int MD1, 
int MQ1, 
int NBZ>
 
  285MFEM_HOST_DEVICE 
inline void EvalX(
const int D1D, 
const int Q1D,
 
  286                                   const real_t (&sB)[MQ1*MD1],
 
  287                                   const real_t (&sX)[2][NBZ][MD1*MD1],
 
  288                                   real_t (&sDQ)[2][NBZ][MD1*MQ1])
 
  290   const int tidz = MFEM_THREAD_ID(z);
 
  297   MFEM_FOREACH_THREAD(dy,y,D1D)
 
  299      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  302         for (
int dx = 0; dx < D1D; ++dx)
 
  304            const real_t xx = X0(dx,dy);
 
  305            const real_t xy = X1(dx,dy);
 
  306            u[0] += B(dx,qx) * xx;
 
  307            u[1] += B(dx,qx) * xy;
 
  317template<
int MD1, 
int MQ1, 
int NBZ>
 
  318MFEM_HOST_DEVICE 
inline void EvalY(
const int D1D, 
const int Q1D,
 
  319                                   const real_t (&sB)[MQ1*MD1],
 
  320                                   const real_t (&sDQ)[2][NBZ][MD1*MQ1],
 
  321                                   real_t (&sQQ)[2][NBZ][MQ1*MQ1])
 
  323   const int tidz = MFEM_THREAD_ID(z);
 
  330   MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  332      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  335         for (
int dy = 0; dy < D1D; ++dy)
 
  337            u[0] += DQ0(qx,dy) * B(dy,qy);
 
  338            u[1] += DQ1(qx,dy) * B(dy,qy);
 
  348template<
int MQ1, 
int NBZ>
 
  349MFEM_HOST_DEVICE 
inline void PullEval(
const int Q1D,
 
  350                                      const int qx, 
const int qy,
 
  351                                      const real_t (&sQQ)[2][NBZ][MQ1*MQ1],
 
  354   const int tidz = MFEM_THREAD_ID(z);
 
  363template<
int MQ1, 
int NBZ>
 
  364MFEM_HOST_DEVICE 
inline void PushEval(
const int Q1D,
 
  365                                      const int qx, 
const int qy,
 
  367                                      real_t (&sQQ)[2][NBZ][MQ1*MQ1])
 
  369   const int tidz = MFEM_THREAD_ID(z);
 
  378template<
int MD1, 
int MQ1, 
int NBZ>
 
  379MFEM_HOST_DEVICE 
inline void EvalXt(
const int D1D, 
const int Q1D,
 
  380                                    const real_t (&sB)[MQ1*MD1],
 
  381                                    const real_t (&sQQ)[2][NBZ][MQ1*MQ1],
 
  382                                    real_t (&sDQ)[2][NBZ][MD1*MQ1])
 
  384   const int tidz = MFEM_THREAD_ID(z);
 
  391   MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  393      MFEM_FOREACH_THREAD(dx,x,D1D)
 
  396         for (
int qx = 0; qx < Q1D; ++qx)
 
  398            u[0] += QQ0(qx,qy) * Bt(qx,dx);
 
  399            u[1] += QQ1(qx,qy) * Bt(qx,dx);
 
  409template<
int MD1, 
int MQ1, 
int NBZ>
 
  410MFEM_HOST_DEVICE 
inline void EvalYt(
const int D1D, 
const int Q1D,
 
  411                                    const real_t (&sB)[MQ1*MD1],
 
  412                                    const real_t (&sDQ)[2][NBZ][MD1*MQ1],
 
  413                                    const DeviceTensor<4> &Y, 
 
  416   const int tidz = MFEM_THREAD_ID(z);
 
  421   MFEM_FOREACH_THREAD(dy,y,D1D)
 
  423      MFEM_FOREACH_THREAD(dx,x,D1D)
 
  426         for (
int qy = 0; qy < Q1D; ++qy)
 
  428            u[0] += Bt(qy,dy) * DQ0(qy,dx);
 
  429            u[1] += Bt(qy,dy) * DQ1(qy,dx);
 
  431         Y(dx,dy,0,e) += 
u[0];
 
  432         Y(dx,dy,1,e) += 
u[1];
 
  439template<
int MD1, 
int MQ1, 
int NBZ>
 
  440MFEM_HOST_DEVICE 
inline void GradX(
const int D1D, 
const int Q1D,
 
  441                                   const real_t (&sBG)[2][MQ1*MD1],
 
  442                                   const real_t (&sX)[2][NBZ][MD1*MD1],
 
  443                                   real_t (&sDQ)[4][NBZ][MD1*MQ1])
 
  445   const int tidz = MFEM_THREAD_ID(z);
 
  455   MFEM_FOREACH_THREAD(dy,y,D1D)
 
  457      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  461         for (
int dx = 0; dx < D1D; ++dx)
 
  463            const real_t Bx = B(dx,qx);
 
  464            const real_t Gx = G(dx,qx);
 
  465            const real_t x0 = X0(dx,dy);
 
  466            const real_t x1 = X1(dx,dy);
 
  482template<
int MD1, 
int MQ1, 
int NBZ>
 
  483MFEM_HOST_DEVICE 
inline void GradY(
const int D1D, 
const int Q1D,
 
  484                                   const real_t (&sBG)[2][MQ1*MD1],
 
  485                                   const real_t (&sDQ)[4][NBZ][MD1*MQ1],
 
  486                                   real_t (&sQQ)[4][NBZ][MQ1*MQ1])
 
  488   const int tidz = MFEM_THREAD_ID(z);
 
  500   MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  502      MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  506         for (
int dy = 0; dy < D1D; ++dy)
 
  508            const real_t By = B(dy,qy);
 
  509            const real_t Gy = G(dy,qy);
 
  510            u[0] += X0G(qx,dy) * By;
 
  511            v[0] += X0B(qx,dy) * Gy;
 
  512            u[1] += X1G(qx,dy) * By;
 
  513            v[1] += X1B(qx,dy) * Gy;
 
  525template<
int MQ1, 
int NBZ>
 
  526MFEM_HOST_DEVICE 
inline void PullGrad(
const int Q1D,
 
  527                                      const int qx, 
const int qy,
 
  528                                      const real_t (&sQQ)[4][NBZ][MQ1*MQ1],
 
  531   const int tidz = MFEM_THREAD_ID(z);
 
  537   Jpr[0] = X0GB(qx,qy);
 
  538   Jpr[1] = X1GB(qx,qy);
 
  539   Jpr[2] = X0BG(qx,qy);
 
  540   Jpr[3] = X1BG(qx,qy);
 
  544template<
int MQ1, 
int NBZ>
 
  545MFEM_HOST_DEVICE 
inline void PushGrad(
const int Q1D,
 
  546                                      const int qx, 
const int qy,
 
  548                                      real_t (&sQQ)[4][NBZ][MQ1*MQ1])
 
  550   const int tidz = MFEM_THREAD_ID(z);
 
  563template<
int MD1, 
int MQ1, 
int NBZ>
 
  564MFEM_HOST_DEVICE 
inline void GradYt(
const int D1D, 
const int Q1D,
 
  565                                    const real_t (&sBG)[2][MQ1*MD1],
 
  566                                    const real_t (&GQ)[4][NBZ][MQ1*MQ1],
 
  567                                    real_t (&GD)[4][NBZ][MD1*MQ1])
 
  569   const int tidz = MFEM_THREAD_ID(z);
 
  581   MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  583      MFEM_FOREACH_THREAD(dx,x,D1D)
 
  587         for (
int qx = 0; qx < Q1D; ++qx)
 
  589            u[0] += Gt(qx,dx) * QQx0(qx,qy);
 
  590            u[1] += Gt(qx,dx) * QQy0(qx,qy);
 
  591            v[0] += Bt(qx,dx) * QQx1(qx,qy);
 
  592            v[1] += Bt(qx,dx) * QQy1(qx,qy);
 
  604template<
int MD1, 
int MQ1, 
int NBZ>
 
  605MFEM_HOST_DEVICE 
inline void GradXt(
const int D1D, 
const int Q1D,
 
  606                                    const real_t (&sBG)[2][MQ1*MD1],
 
  607                                    const real_t (&GD)[4][NBZ][MD1*MQ1],
 
  608                                    const DeviceTensor<4> &Y, 
 
  611   const int tidz = MFEM_THREAD_ID(z);
 
  619   MFEM_FOREACH_THREAD(dy,y,D1D)
 
  621      MFEM_FOREACH_THREAD(dx,x,D1D)
 
  625         for (
int qy = 0; qy < Q1D; ++qy)
 
  627            u[0] += DQxB(qy,dx) * Bt(qy,dy);
 
  628            u[1] += DQyB(qy,dx) * Bt(qy,dy);
 
  629            v[0] += DQxG(qy,dx) * Gt(qy,dy);
 
  630            v[1] += DQyG(qy,dx) * Gt(qy,dy);
 
  632         Y(dx,dy,0,e) += 
u[0] + v[0];
 
  633         Y(dx,dy,1,e) += 
u[1] + v[1];
 
  640MFEM_HOST_DEVICE 
inline void LoadX(
const int e, 
const int D1D,
 
  641                                   const DeviceTensor<4, const real_t> &x,
 
  644   MFEM_FOREACH_THREAD(dz,z,D1D)
 
  646      MFEM_FOREACH_THREAD(dy,y,D1D)
 
  648         MFEM_FOREACH_THREAD(dx,x,D1D)
 
  650            X(dx,dy,dz) = x(dx,dy,dz,e);
 
  658MFEM_HOST_DEVICE 
inline void LoadX(
const int e, 
const int D1D,
 
  659                                   const DeviceTensor<4, const real_t> &x,
 
  660                                   real_t (&sm)[MD1*MD1*MD1])
 
  667MFEM_HOST_DEVICE 
inline void LoadX(
const int e, 
const int D1D, 
const int c,
 
  668                                   const DeviceTensor<5, const real_t> &x,
 
  671   MFEM_FOREACH_THREAD(dz,z,D1D)
 
  673      MFEM_FOREACH_THREAD(dy,y,D1D)
 
  675         MFEM_FOREACH_THREAD(dx,x,D1D)
 
  677            X(dx,dy,dz) = x(dx,dy,dz,c,e);
 
  686MFEM_HOST_DEVICE 
inline void LoadX(
const int e, 
const int D1D, 
const int c,
 
  687                                   const DeviceTensor<5, const real_t> &x,
 
  688                                   real_t (&sm)[MD1*MD1*MD1])
 
  691   return LoadX<MD1>(e,D1D,c,x,X);
 
  695MFEM_HOST_DEVICE 
inline void EvalX(
const int D1D, 
const int Q1D,
 
  700   MFEM_FOREACH_THREAD(dz,z,D1D)
 
  702      MFEM_FOREACH_THREAD(dy,y,D1D)
 
  704         MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  707            for (
int dx = 0; dx < D1D; ++dx)
 
  709               const real_t Bx = B(dx,qx);
 
  710               u += Bx * DDD(dx,dy,dz);
 
  719template<
int MD1, 
int MQ1>
 
  720MFEM_HOST_DEVICE 
inline void EvalX(
const int D1D, 
const int Q1D,
 
  721                                   const real_t (&sB)[MQ1*MD1],
 
  722                                   const real_t (&sDDD)[MD1*MD1*MD1],
 
  723                                   real_t (&sDDQ)[MD1*MD1*MQ1])
 
  728   EvalX(D1D,Q1D,B,DDD,DDQ);
 
  732MFEM_HOST_DEVICE 
inline void EvalY(
const int D1D, 
const int Q1D,
 
  737   MFEM_FOREACH_THREAD(dz,z,D1D)
 
  739      MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  741         MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  744            for (
int dy = 0; dy < D1D; ++dy)
 
  746               const real_t By = B(dy,qy);
 
  747               u += DDQ(dz,dy,qx) * By;
 
  756template<
int MD1, 
int MQ1>
 
  757MFEM_HOST_DEVICE 
inline void EvalY(
const int D1D, 
const int Q1D,
 
  758                                   const real_t (&sB)[MQ1*MD1],
 
  759                                   const real_t (&sDDQ)[MD1*MD1*MQ1],
 
  760                                   real_t (&sDQQ)[MD1*MQ1*MQ1])
 
  765   EvalY(D1D,Q1D,B,DDQ,DQQ);
 
  769MFEM_HOST_DEVICE 
inline void EvalZ(
const int D1D, 
const int Q1D,
 
  774   MFEM_FOREACH_THREAD(qz,z,Q1D)
 
  776      MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  778         MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  781            for (
int dz = 0; dz < D1D; ++dz)
 
  783               const real_t Bz = B(dz,qz);
 
  784               u += DQQ(dz,qy,qx) * Bz;
 
  793template<
int MD1, 
int MQ1>
 
  794MFEM_HOST_DEVICE 
inline void EvalZ(
const int D1D, 
const int Q1D,
 
  795                                   const real_t (&sB)[MQ1*MD1],
 
  796                                   const real_t (&sDQQ)[MD1*MQ1*MQ1],
 
  797                                   real_t (&sQQQ)[MQ1*MQ1*MQ1])
 
  802   EvalZ(D1D,Q1D,B,DQQ,QQQ);
 
  806MFEM_HOST_DEVICE 
inline void PullEval(
const int x, 
const int y, 
const int z,
 
  814MFEM_HOST_DEVICE 
inline void PullEval(
const int Q1D,
 
  815                                      const int x, 
const int y, 
const int z,
 
  816                                      const real_t (&sQQQ)[MQ1*MQ1*MQ1],
 
  820   PullEval(x,y,z,QQQ,X);
 
  825MFEM_HOST_DEVICE 
inline void LoadX(
const int e, 
const int D1D,
 
  826                                   const DeviceTensor<5, const real_t> &X,
 
  827                                   real_t (*sm)[MD1*MD1*MD1])
 
  833   MFEM_FOREACH_THREAD(dz,z,D1D)
 
  835      MFEM_FOREACH_THREAD(dy,y,D1D)
 
  837         MFEM_FOREACH_THREAD(dx,x,D1D)
 
  839            Xx(dx,dy,dz) = X(dx,dy,dz,0,e);
 
  840            Xy(dx,dy,dz) = X(dx,dy,dz,1,e);
 
  841            Xz(dx,dy,dz) = X(dx,dy,dz,2,e);
 
  849template<
int MD1, 
int MQ1>
 
  850MFEM_HOST_DEVICE 
inline void EvalX(
const int D1D, 
const int Q1D,
 
  851                                   const real_t (&sB)[MQ1*MD1],
 
  852                                   const real_t (&sDDD)[3][MD1*MD1*MD1],
 
  853                                   real_t (&sDDQ)[3][MD1*MD1*MQ1])
 
  863   MFEM_FOREACH_THREAD(dz,z,D1D)
 
  865      MFEM_FOREACH_THREAD(dy,y,D1D)
 
  867         MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  870            for (
int dx = 0; dx < D1D; ++dx)
 
  872               const real_t Bx = B(dx,qx);
 
  873               u[0] += Bx * Xx(dx,dy,dz);
 
  874               u[1] += Bx * Xy(dx,dy,dz);
 
  875               u[2] += Bx * Xz(dx,dy,dz);
 
  877            XxB(qx,dy,dz) = 
u[0];
 
  878            XyB(qx,dy,dz) = 
u[1];
 
  879            XzB(qx,dy,dz) = 
u[2];
 
  887template<
int MD1, 
int MQ1>
 
  888MFEM_HOST_DEVICE 
inline void EvalY(
const int D1D, 
const int Q1D,
 
  889                                   const real_t (&sB)[MQ1*MD1],
 
  890                                   const real_t (&sDDQ)[3][MD1*MD1*MQ1],
 
  891                                   real_t (&sDQQ)[3][MD1*MQ1*MQ1])
 
  901   MFEM_FOREACH_THREAD(dz,z,D1D)
 
  903      MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  905         MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  908            for (
int dy = 0; dy < D1D; ++dy)
 
  910               const real_t By = B(dy,qy);
 
  911               u[0] += XxB(qx,dy,dz) * By;
 
  912               u[1] += XyB(qx,dy,dz) * By;
 
  913               u[2] += XzB(qx,dy,dz) * By;
 
  915            XxBB(qx,qy,dz) = 
u[0];
 
  916            XyBB(qx,qy,dz) = 
u[1];
 
  917            XzBB(qx,qy,dz) = 
u[2];
 
  925template<
int MD1, 
int MQ1>
 
  926MFEM_HOST_DEVICE 
inline void EvalZ(
const int D1D, 
const int Q1D,
 
  927                                   const real_t (&sB)[MQ1*MD1],
 
  928                                   const real_t (&sDQQ)[3][MD1*MQ1*MQ1],
 
  929                                   real_t (&sQQQ)[3][MQ1*MQ1*MQ1])
 
  939   MFEM_FOREACH_THREAD(qz,z,Q1D)
 
  941      MFEM_FOREACH_THREAD(qy,y,Q1D)
 
  943         MFEM_FOREACH_THREAD(qx,x,Q1D)
 
  946            for (
int dz = 0; dz < D1D; ++dz)
 
  948               const real_t Bz = B(dz,qz);
 
  949               u[0] += XxBB(qx,qy,dz) * Bz;
 
  950               u[1] += XyBB(qx,qy,dz) * Bz;
 
  951               u[2] += XzBB(qx,qy,dz) * Bz;
 
  953            XxBBB(qx,qy,qz) = 
u[0];
 
  954            XyBBB(qx,qy,qz) = 
u[1];
 
  955            XzBBB(qx,qy,qz) = 
u[2];
 
  964MFEM_HOST_DEVICE 
inline void PullEval(
const int Q1D,
 
  965                                      const int x, 
const int y, 
const int z,
 
  966                                      const real_t (&sQQQ)[3][MQ1*MQ1*MQ1],
 
  980MFEM_HOST_DEVICE 
inline void PushEval(
const int Q1D,
 
  981                                      const int x, 
const int y, 
const int z,
 
  983                                      real_t (&sQQQ)[3][MQ1*MQ1*MQ1])
 
  995template<
int MD1, 
int MQ1>
 
  996MFEM_HOST_DEVICE 
inline void EvalXt(
const int D1D, 
const int Q1D,
 
  997                                    const real_t (&sB)[MQ1*MD1],
 
  998                                    const real_t (&sQQQ)[3][MQ1*MQ1*MQ1],
 
  999                                    real_t (&sDQQ)[3][MD1*MQ1*MQ1])
 
 1009   MFEM_FOREACH_THREAD(qz,z,Q1D)
 
 1011      MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 1013         MFEM_FOREACH_THREAD(dx,x,D1D)
 
 1015            real_t u[3] = {0.0, 0.0, 0.0};
 
 1016            for (
int qx = 0; qx < Q1D; ++qx)
 
 1018               const real_t Btx = Bt(qx,dx);
 
 1019               u[0] += XxBBB(qx,qy,qz) * Btx;
 
 1020               u[1] += XyBBB(qx,qy,qz) * Btx;
 
 1021               u[2] += XzBBB(qx,qy,qz) * Btx;
 
 1023            XxBB(qz,qy,dx) = 
u[0];
 
 1024            XyBB(qz,qy,dx) = 
u[1];
 
 1025            XzBB(qz,qy,dx) = 
u[2];
 
 1033template<
int MD1, 
int MQ1>
 
 1034MFEM_HOST_DEVICE 
inline void EvalYt(
const int D1D, 
const int Q1D,
 
 1035                                    const real_t (&sB)[MQ1*MD1],
 
 1036                                    const real_t (&sDQQ)[3][MD1*MQ1*MQ1],
 
 1037                                    real_t (&sDDQ)[3][MD1*MD1*MQ1])
 
 1047   MFEM_FOREACH_THREAD(qz,z,Q1D)
 
 1049      MFEM_FOREACH_THREAD(dy,y,D1D)
 
 1051         MFEM_FOREACH_THREAD(dx,x,D1D)
 
 1053            real_t u[3] = {0.0, 0.0, 0.0};
 
 1054            for (
int qy = 0; qy < Q1D; ++qy)
 
 1056               const real_t Bty = Bt(qy,dy);
 
 1057               u[0] += XxBB(qz,qy,dx) * Bty;
 
 1058               u[1] += XyBB(qz,qy,dx) * Bty;
 
 1059               u[2] += XzBB(qz,qy,dx) * Bty;
 
 1062            XxB(qz,dy,dx) = 
u[0];
 
 1063            XyB(qz,dy,dx) = 
u[1];
 
 1064            XzB(qz,dy,dx)= 
u[2];
 
 1072template<
int MD1, 
int MQ1>
 
 1073MFEM_HOST_DEVICE 
inline void EvalZt(
const int D1D, 
const int Q1D,
 
 1074                                    const real_t (&sB)[MQ1*MD1],
 
 1075                                    const real_t (&sDDQ)[3][MD1*MD1*MQ1],
 
 1076                                    const DeviceTensor<5> &Y, 
 
 1084   MFEM_FOREACH_THREAD(dz,z,D1D)
 
 1086      MFEM_FOREACH_THREAD(dy,y,D1D)
 
 1088         MFEM_FOREACH_THREAD(dx,x,D1D)
 
 1090            real_t u[3] = {0.0, 0.0, 0.0};
 
 1091            for (
int qz = 0; qz < Q1D; ++qz)
 
 1093               const real_t Btz = Bt(qz,dz);
 
 1094               u[0] += XxB(qz,dy,dx) * Btz;
 
 1095               u[1] += XyB(qz,dy,dx) * Btz;
 
 1096               u[2] += XzB(qz,dy,dx) * Btz;
 
 1098            Y(dx,dy,dz,0,e) += 
u[0];
 
 1099            Y(dx,dy,dz,1,e) += 
u[1];
 
 1100            Y(dx,dy,dz,2,e) += 
u[2];
 
 1107template<
int MD1, 
int MQ1>
 
 1108MFEM_HOST_DEVICE 
inline void GradX(
const int D1D, 
const int Q1D,
 
 1109                                   const real_t (*sBG)[MQ1*MD1],
 
 1110                                   const real_t (*sDDD)[MD1*MD1*MD1],
 
 1111                                   real_t (*sDDQ)[MD1*MD1*MQ1])
 
 1125   MFEM_FOREACH_THREAD(dz,z,D1D)
 
 1127      MFEM_FOREACH_THREAD(dy,y,D1D)
 
 1129         MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 1131            real_t u[3] = {0.0, 0.0, 0.0};
 
 1132            real_t v[3] = {0.0, 0.0, 0.0};
 
 1133            for (
int dx = 0; dx < D1D; ++dx)
 
 1135               const real_t xx = Xx(dx,dy,dz);
 
 1136               const real_t xy = Xy(dx,dy,dz);
 
 1137               const real_t xz = Xz(dx,dy,dz);
 
 1138               const real_t Bx = B(dx,qx);
 
 1139               const real_t Gx = G(dx,qx);
 
 1148            XxB(qx,dy,dz) = 
u[0];
 
 1149            XyB(qx,dy,dz) = 
u[1];
 
 1150            XzB(qx,dy,dz) = 
u[2];
 
 1152            XxG(qx,dy,dz) = v[0];
 
 1153            XyG(qx,dy,dz) = v[1];
 
 1154            XzG(qx,dy,dz) = v[2];
 
 1162template<
int MD1, 
int MQ1>
 
 1163MFEM_HOST_DEVICE 
inline void GradY(
const int D1D, 
const int Q1D,
 
 1164                                   const real_t (*sBG)[MQ1*MD1],
 
 1165                                   const real_t (*sDDQ)[MD1*MD1*MQ1],
 
 1166                                   real_t (*sDQQ)[MD1*MQ1*MQ1])
 
 1186   MFEM_FOREACH_THREAD(dz,z,D1D)
 
 1188      MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 1190         MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 1192            real_t u[3] = {0.0, 0.0, 0.0};
 
 1193            real_t v[3] = {0.0, 0.0, 0.0};
 
 1194            real_t w[3] = {0.0, 0.0, 0.0};
 
 1195            for (
int dy = 0; dy < D1D; ++dy)
 
 1197               const real_t By = B(dy,qy);
 
 1198               const real_t Gy = G(dy,qy);
 
 1200               u[0] += XxB(qx,dy,dz) * By;
 
 1201               u[1] += XyB(qx,dy,dz) * By;
 
 1202               u[2] += XzB(qx,dy,dz) * By;
 
 1204               v[0] += XxG(qx,dy,dz) * By;
 
 1205               v[1] += XyG(qx,dy,dz) * By;
 
 1206               v[2] += XzG(qx,dy,dz) * By;
 
 1208               w[0] += XxB(qx,dy,dz) * Gy;
 
 1209               w[1] += XyB(qx,dy,dz) * Gy;
 
 1210               w[2] += XzB(qx,dy,dz) * Gy;
 
 1212            XxBB(qx,qy,dz) = 
u[0];
 
 1213            XyBB(qx,qy,dz) = 
u[1];
 
 1214            XzBB(qx,qy,dz) = 
u[2];
 
 1216            XxBG(qx,qy,dz) = v[0];
 
 1217            XyBG(qx,qy,dz) = v[1];
 
 1218            XzBG(qx,qy,dz) = v[2];
 
 1220            XxGB(qx,qy,dz) = w[0];
 
 1221            XyGB(qx,qy,dz) = w[1];
 
 1222            XzGB(qx,qy,dz) = w[2];
 
 1230template<
int MD1, 
int MQ1>
 
 1231MFEM_HOST_DEVICE 
inline void GradZ(
const int D1D, 
const int Q1D,
 
 1232                                   const real_t (*sBG)[MQ1*MD1],
 
 1233                                   const real_t (*sDQQ)[MD1*MQ1*MQ1],
 
 1234                                   real_t (*sQQQ)[MQ1*MQ1*MQ1])
 
 1257   MFEM_FOREACH_THREAD(qz,z,Q1D)
 
 1259      MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 1261         MFEM_FOREACH_THREAD(qx,x,Q1D)
 
 1263            real_t u[3] = {0.0, 0.0, 0.0};
 
 1264            real_t v[3] = {0.0, 0.0, 0.0};
 
 1265            real_t w[3] = {0.0, 0.0, 0.0};
 
 1266            for (
int dz = 0; dz < D1D; ++dz)
 
 1268               const real_t Bz = B(dz,qz);
 
 1269               const real_t Gz = G(dz,qz);
 
 1271               u[0] += XxBG(qx,qy,dz) * Bz;
 
 1272               u[1] += XyBG(qx,qy,dz) * Bz;
 
 1273               u[2] += XzBG(qx,qy,dz) * Bz;
 
 1275               v[0] += XxGB(qx,qy,dz) * Bz;
 
 1276               v[1] += XyGB(qx,qy,dz) * Bz;
 
 1277               v[2] += XzGB(qx,qy,dz) * Bz;
 
 1279               w[0] += XxBB(qx,qy,dz) * Gz;
 
 1280               w[1] += XyBB(qx,qy,dz) * Gz;
 
 1281               w[2] += XzBB(qx,qy,dz) * Gz;
 
 1283            XxBBG(qx,qy,qz) = 
u[0];
 
 1284            XyBBG(qx,qy,qz) = 
u[1];
 
 1285            XzBBG(qx,qy,qz) = 
u[2];
 
 1287            XxBGB(qx,qy,qz) = v[0];
 
 1288            XyBGB(qx,qy,qz) = v[1];
 
 1289            XzBGB(qx,qy,qz) = v[2];
 
 1291            XxGBB(qx,qy,qz)= w[0];
 
 1292            XyGBB(qx,qy,qz) = w[1];
 
 1293            XzGBB(qx,qy,qz) = w[2];
 
 1302MFEM_HOST_DEVICE 
inline void PullGrad(
const int Q1D,
 
 1303                                      const int x, 
const int y, 
const int z,
 
 1304                                      const real_t (*sQQQ)[MQ1*MQ1*MQ1],
 
 1317   Jpr[0] = XxBBG(x,y,z);
 
 1318   Jpr[3] = XxBGB(x,y,z);
 
 1319   Jpr[6] = XxGBB(x,y,z);
 
 1320   Jpr[1] = XyBBG(x,y,z);
 
 1321   Jpr[4] = XyBGB(x,y,z);
 
 1322   Jpr[7] = XyGBB(x,y,z);
 
 1323   Jpr[2] = XzBBG(x,y,z);
 
 1324   Jpr[5] = XzBGB(x,y,z);
 
 1325   Jpr[8] = XzGBB(x,y,z);
 
 1330MFEM_HOST_DEVICE 
inline void PushGrad(
const int Q1D,
 
 1331                                      const int x, 
const int y, 
const int z,
 
 1333                                      real_t (&sQQQ)[9][MQ1*MQ1*MQ1])
 
 1345   XxBBG(x,y,z) = A[0];
 
 1346   XxBGB(x,y,z) = A[1];
 
 1347   XxGBB(x,y,z) = A[2];
 
 1348   XyBBG(x,y,z) = A[3];
 
 1349   XyBGB(x,y,z) = A[4];
 
 1350   XyGBB(x,y,z) = A[5];
 
 1351   XzBBG(x,y,z) = A[6];
 
 1352   XzBGB(x,y,z) = A[7];
 
 1353   XzGBB(x,y,z) = A[8];
 
 1357template<
int MD1, 
int MQ1>
 
 1358MFEM_HOST_DEVICE 
inline void GradZt(
const int D1D, 
const int Q1D,
 
 1359                                    const real_t (&sBG)[2][MQ1*MD1],
 
 1360                                    const real_t (&sQQQ)[9][MQ1*MQ1*MQ1],
 
 1361                                    real_t (&sDQQ)[9][MD1*MQ1*MQ1])
 
 1385   MFEM_FOREACH_THREAD(qz,z,Q1D)
 
 1387      MFEM_FOREACH_THREAD(qy,y,Q1D)
 
 1389         MFEM_FOREACH_THREAD(dx,x,D1D)
 
 1391            real_t u[3] = {0.0, 0.0, 0.0};
 
 1392            real_t v[3] = {0.0, 0.0, 0.0};
 
 1393            real_t w[3] = {0.0, 0.0, 0.0};
 
 1394            for (
int qx = 0; qx < Q1D; ++qx)
 
 1396               const real_t Btx = Bt(qx,dx);
 
 1397               const real_t Gtx = Gt(qx,dx);
 
 1399               u[0] += XxBBG(qx,qy,qz) * Gtx;
 
 1400               v[0] += XxBGB(qx,qy,qz) * Btx;
 
 1401               w[0] += XxGBB(qx,qy,qz) * Btx;
 
 1403               u[1] += XyBBG(qx,qy,qz) * Gtx;
 
 1404               v[1] += XyBGB(qx,qy,qz) * Btx;
 
 1405               w[1] += XyGBB(qx,qy,qz) * Btx;
 
 1407               u[2] += XzBBG(qx,qy,qz) * Gtx;
 
 1408               v[2] += XzBGB(qx,qy,qz) * Btx;
 
 1409               w[2] += XzGBB(qx,qy,qz) * Btx;
 
 1411            XxBB(qz,qy,dx) = 
u[0];
 
 1412            XxBG(qz,qy,dx) = v[0];
 
 1413            XxGB(qz,qy,dx) = w[0];
 
 1415            XyBB(qz,qy,dx) = 
u[1];
 
 1416            XyBG(qz,qy,dx) = v[1];
 
 1417            XyGB(qz,qy,dx) = w[1];
 
 1419            XzBB(qz,qy,dx) = 
u[2];
 
 1420            XzBG(qz,qy,dx) = v[2];
 
 1421            XzGB(qz,qy,dx) = w[2];
 
 1429template<
int MD1, 
int MQ1>
 
 1430MFEM_HOST_DEVICE 
inline void GradYt(
const int D1D, 
const int Q1D,
 
 1431                                    const real_t (&sBG)[2][MQ1*MD1],
 
 1432                                    const real_t (&sDQQ)[9][MD1*MQ1*MQ1],
 
 1433                                    real_t (&sDDQ)[9][MD1*MD1*MQ1])
 
 1456   MFEM_FOREACH_THREAD(qz,z,Q1D)
 
 1458      MFEM_FOREACH_THREAD(dy,y,D1D)
 
 1460         MFEM_FOREACH_THREAD(dx,x,D1D)
 
 1462            real_t u[3] = {0.0, 0.0, 0.0};
 
 1463            real_t v[3] = {0.0, 0.0, 0.0};
 
 1464            real_t w[3] = {0.0, 0.0, 0.0};
 
 1465            for (
int qy = 0; qy < Q1D; ++qy)
 
 1467               const real_t Bty = Bt(qy,dy);
 
 1468               const real_t Gty = Gt(qy,dy);
 
 1470               u[0] += XxBB(qz,qy,dx) * Bty;
 
 1471               v[0] += XxBG(qz,qy,dx) * Gty;
 
 1472               w[0] += XxGB(qz,qy,dx) * Bty;
 
 1474               u[1] += XyBB(qz,qy,dx) * Bty;
 
 1475               v[1] += XyBG(qz,qy,dx) * Gty;
 
 1476               w[1] += XyGB(qz,qy,dx) * Bty;
 
 1478               u[2] += XzBB(qz,qy,dx) * Bty;
 
 1479               v[2] += XzBG(qz,qy,dx) * Gty;
 
 1480               w[2] += XzGB(qz,qy,dx) * Bty;
 
 1483            XxB(qz,dy,dx) = 
u[0];
 
 1484            XxC(qz,dy,dx) = v[0];
 
 1485            XxG(qz,dy,dx) = w[0];
 
 1487            XyB(qz,dy,dx) = 
u[1];
 
 1488            XyC(qz,dy,dx) = v[1];
 
 1489            XyG(qz,dy,dx) = w[1];
 
 1491            XzB(qz,dy,dx) = 
u[2];
 
 1492            XzC(qz,dy,dx) = v[2];
 
 1493            XzG(qz,dy,dx) = w[2];
 
 1501template<
int MD1, 
int MQ1>
 
 1502MFEM_HOST_DEVICE 
inline void GradXt(
const int D1D, 
const int Q1D,
 
 1503                                    const real_t (&sBG)[2][MQ1*MD1],
 
 1504                                    const real_t (&sDDQ)[9][MD1*MD1*MQ1],
 
 1505                                    const DeviceTensor<5> &Y, 
 
 1520   MFEM_FOREACH_THREAD(dz,z,D1D)
 
 1522      MFEM_FOREACH_THREAD(dy,y,D1D)
 
 1524         MFEM_FOREACH_THREAD(dx,x,D1D)
 
 1526            real_t u[3] = {0.0, 0.0, 0.0};
 
 1527            real_t v[3] = {0.0, 0.0, 0.0};
 
 1528            real_t w[3] = {0.0, 0.0, 0.0};
 
 1529            for (
int qz = 0; qz < Q1D; ++qz)
 
 1531               const real_t Btz = Bt(qz,dz);
 
 1532               const real_t Gtz = Gt(qz,dz);
 
 1534               u[0] += XxB(qz,dy,dx) * Btz;
 
 1535               v[0] += XxC(qz,dy,dx) * Btz;
 
 1536               w[0] += XxG(qz,dy,dx) * Gtz;
 
 1538               u[1] += XyB(qz,dy,dx) * Btz;
 
 1539               v[1] += XyC(qz,dy,dx)* Btz;
 
 1540               w[1] += XyG(qz,dy,dx) * Gtz;
 
 1542               u[2] += XzB(qz,dy,dx) * Btz;
 
 1543               v[2] += XzC(qz,dy,dx) * Btz;
 
 1544               w[2] += XzG(qz,dy,dx) * Gtz;
 
 1546            Y(dx,dy,dz,0,e) += 
u[0] + v[0] + w[0];
 
 1547            Y(dx,dy,dz,1,e) += 
u[1] + v[1] + w[1];
 
 1548            Y(dx,dy,dz,2,e) += 
u[2] + v[2] + w[2];
 
real_t u(const Vector &xvec)
DeviceTensor< 2, real_t > DeviceMatrix
DeviceTensor< 2, const real_t > ConstDeviceMatrix
DeviceTensor< 3, const real_t > ConstDeviceCube
DeviceTensor< 3, real_t > DeviceCube