22static void PADGTraceSetup2D(
const int Q1D,
 
   24                             const Array<real_t> &w,
 
   35   auto d = 
Reshape(det.Read(), Q1D, NF);
 
   36   auto n = 
Reshape(nor.Read(), Q1D, VDIM, NF);
 
   37   const bool const_r = rho.Size() == 1;
 
   40   const bool const_v = 
vel.Size() == 2;
 
   44   auto qd = 
Reshape(op.Write(), Q1D, 2, 2, NF);
 
   48      const int f = tid / Q1D;
 
   49      const int q = tid % Q1D;
 
   51         const real_t r = const_r ? R(0,0) : R(q,
f);
 
   52         const real_t v0 = const_v ? V(0,0,0) : V(0,q,
f);
 
   53         const real_t v1 = const_v ? V(1,0,0) : V(1,q,
f);
 
   54         const real_t dot = n(q,0,
f) * v0 + n(q,1,
f) * v1;
 
   55         const real_t abs = dot > 0_r ? dot : -dot;
 
   57         qd(q,0,0,
f) = w*( 
alpha/2 * dot + 
beta * abs );
 
   58         qd(q,1,0,
f) = w*( 
alpha/2 * dot - 
beta * abs );
 
   59         qd(q,0,1,
f) = w*(-
alpha/2 * dot - 
beta * abs );
 
   60         qd(q,1,1,
f) = w*(-
alpha/2 * dot + 
beta * abs );
 
   65static void PADGTraceSetup3D(
const int Q1D,
 
   67                             const Array<real_t> &w,
 
   78   auto d = 
Reshape(det.Read(), Q1D, Q1D, NF);
 
   79   auto n = 
Reshape(nor.Read(), Q1D, Q1D, VDIM, NF);
 
   80   const bool const_r = rho.Size() == 1;
 
   83   const bool const_v = 
vel.Size() == 3;
 
   87   auto qd = 
Reshape(op.Write(), Q1D, Q1D, 2, 2, NF);
 
   91      int f = tid / (Q1D * Q1D);
 
   92      int q2 = (tid / Q1D) % Q1D;
 
   96            const real_t r = const_r ? R(0,0,0) : R(q1,q2,
f);
 
   97            const real_t v0 = const_v ? V(0,0,0,0) : V(0,q1,q2,
f);
 
   98            const real_t v1 = const_v ? V(1,0,0,0) : V(1,q1,q2,
f);
 
   99            const real_t v2 = const_v ? V(2,0,0,0) : V(2,q1,q2,
f);
 
  100            const real_t dot = n(q1,q2,0,
f) * v0 + n(q1,q2,1,
f) * v1 +
 
  102            const real_t abs = dot > 0.0 ? dot : -dot;
 
  103            const real_t w = W[q1+q2*Q1D]*r*d(q1,q2,
f);
 
  104            qd(q1,q2,0,0,
f) = w*( 
alpha/2 * dot + 
beta * abs );
 
  105            qd(q1,q2,1,0,
f) = w*( 
alpha/2 * dot - 
beta * abs );
 
  106            qd(q1,q2,0,1,
f) = w*(-
alpha/2 * dot - 
beta * abs );
 
  107            qd(q1,q2,1,1,
f) = w*(-
alpha/2 * dot + 
beta * abs );
 
  113static void PADGTraceSetup(
const int dim,
 
  117                           const Array<real_t> &W,
 
  126   if (
dim == 1) { MFEM_ABORT(
"dim==1 not supported in PADGTraceSetup"); }
 
  129      PADGTraceSetup2D(Q1D, NF, W, det, nor, rho, 
u, 
alpha, 
beta, op);
 
  133      PADGTraceSetup3D(Q1D, NF, W, det, nor, rho, 
u, 
alpha, 
beta, op);
 
  137void DGTraceIntegrator::SetupPA(
const FiniteElementSpace &fes, 
FaceType type)
 
  142   nf = fes.GetNFbyType(type);
 
  143   if (
nf==0) { 
return; }
 
  145   Mesh *mesh = fes.GetMesh();
 
  146   const FiniteElement &el = *fes.GetTypicalTraceElement();
 
  147   const IntegrationRule *ir = 
IntRule?
 
  149                               &
GetRule(el.GetGeomType(), el.GetOrder(),
 
  150                                        *mesh->GetTypicalElementTransformation());
 
  151   const int symmDims = 4;
 
  152   nq = ir->GetNPoints();
 
  153   dim = mesh->Dimension();
 
  154   geom = mesh->GetFaceGeometricFactors(
 
  163   FaceQuadratureSpace qs(*mesh, *ir, type);
 
  171   else if (ConstantCoefficient *const_rho = 
dynamic_cast<ConstantCoefficient*
> 
  174      r.SetConstant(const_rho->constant);
 
  176   else if (QuadratureFunctionCoefficient* qf_rho =
 
  177               dynamic_cast<QuadratureFunctionCoefficient*
>(
rho))
 
  179      r.MakeRef(qf_rho->GetQuadFunction());
 
  188      for (
int f = 0; 
f < mesh->GetNumFacesWithGhost(); ++
f)
 
  190         Mesh::FaceInformation face = mesh->GetFaceInformation(
f);
 
  191         if (face.IsNonconformingCoarse() || !face.IsOfFaceType(type))
 
  197         FaceElementTransformations &T =
 
  198            *fes.GetMesh()->GetFaceElementTransformations(
f);
 
  199         for (
int q = 0; q < 
nq; ++q)
 
  205            T.SetAllIntPoints(&ir->IntPoint(q));
 
  206            const IntegrationPoint &eip1 = T.GetElement1IntPoint();
 
  207            const IntegrationPoint &eip2 = T.GetElement2IntPoint();
 
  210            if (face.IsBoundary())
 
  212               rq = 
rho->
Eval(*T.Elem1, eip1);
 
  217               for (
int d=0; d<
dim; ++d)
 
  219                  udotn += C_vel(d,iq,f_ind)*n(iq,d,f_ind);
 
  221               if (udotn >= 0.0) { rq = 
rho->
Eval(*T.Elem2, eip2); }
 
  222               else { rq = 
rho->
Eval(*T.Elem1, eip1); }
 
  228      MFEM_VERIFY(f_ind==
nf, 
"Incorrect number of faces.");
 
  246template<
int T_D1D = 0, 
int T_Q1D = 0> 
static 
  247void PADGTraceApply2D(
const int NF,
 
  257   const int D1D = T_D1D ? T_D1D : d1d;
 
  258   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  261   auto B = 
Reshape(
b.Read(), Q1D, D1D);
 
  270      const int D1D = T_D1D ? T_D1D : d1d;
 
  271      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  273      constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  274      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  277      for (
int d = 0; d < D1D; d++)
 
  279         for (
int c = 0; c < VDIM; c++)
 
  281            u0[d][c] = x(d,c,0,
f);
 
  282            u1[d][c] = x(d,c,1,
f);
 
  285      real_t Bu0[max_Q1D][VDIM];
 
  286      real_t Bu1[max_Q1D][VDIM];
 
  287      for (
int q = 0; q < Q1D; ++q)
 
  289         for (
int c = 0; c < VDIM; c++)
 
  294         for (
int d = 0; d < D1D; ++d)
 
  297            for (
int c = 0; c < VDIM; c++)
 
  299               Bu0[q][c] += 
b*u0[d][c];
 
  300               Bu1[q][c] += 
b*u1[d][c];
 
  304      real_t DBu[max_Q1D][VDIM];
 
  305      for (
int q = 0; q < Q1D; ++q)
 
  307         for (
int c = 0; c < VDIM; c++)
 
  309            DBu[q][c] = op(q,0,0,
f)*Bu0[q][c] + op(q,1,0,
f)*Bu1[q][c];
 
  312      real_t BDBu[max_D1D][VDIM];
 
  313      for (
int d = 0; d < D1D; ++d)
 
  315         for (
int c = 0; c < VDIM; c++)
 
  319         for (
int q = 0; q < Q1D; ++q)
 
  322            for (
int c = 0; c < VDIM; c++)
 
  324               BDBu[d][c] += 
b*DBu[q][c];
 
  327         for (
int c = 0; c < VDIM; c++)
 
  329            y(d,c,0,
f) +=  BDBu[d][c];
 
  330            y(d,c,1,
f) += -BDBu[d][c];
 
  337template<
int T_D1D = 0, 
int T_Q1D = 0> 
static 
  338void PADGTraceApply3D(
const int NF,
 
  339                      const Array<real_t> &
b,
 
  340                      const Array<real_t> &bt,
 
  348   const int D1D = T_D1D ? T_D1D : d1d;
 
  349   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  352   auto B = 
Reshape(
b.Read(), Q1D, D1D);
 
  353   auto Bt = 
Reshape(bt.Read(), D1D, Q1D);
 
  354   auto op = 
Reshape(op_.Read(), Q1D, Q1D, 2, 2, NF);
 
  355   auto x = 
Reshape(x_.Read(), D1D, D1D, VDIM, 2, NF);
 
  356   auto y = 
Reshape(y_.ReadWrite(), D1D, D1D, VDIM, 2, NF);
 
  361      const int D1D = T_D1D ? T_D1D : d1d;
 
  362      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  364      constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  365      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  366      real_t u0[max_D1D][max_D1D][VDIM];
 
  367      real_t u1[max_D1D][max_D1D][VDIM];
 
  368      for (
int d1 = 0; d1 < D1D; d1++)
 
  370         for (
int d2 = 0; d2 < D1D; d2++)
 
  372            for (
int c = 0; c < VDIM; c++)
 
  374               u0[d1][d2][c] = x(d1,d2,c,0,
f);
 
  375               u1[d1][d2][c] = x(d1,d2,c,1,
f);
 
  379      real_t Bu0[max_Q1D][max_D1D][VDIM];
 
  380      real_t Bu1[max_Q1D][max_D1D][VDIM];
 
  381      for (
int q = 0; q < Q1D; ++q)
 
  383         for (
int d2 = 0; d2 < D1D; d2++)
 
  385            for (
int c = 0; c < VDIM; c++)
 
  390            for (
int d1 = 0; d1 < D1D; ++d1)
 
  393               for (
int c = 0; c < VDIM; c++)
 
  395                  Bu0[q][d2][c] += 
b*u0[d1][d2][c];
 
  396                  Bu1[q][d2][c] += 
b*u1[d1][d2][c];
 
  401      real_t BBu0[max_Q1D][max_Q1D][VDIM];
 
  402      real_t BBu1[max_Q1D][max_Q1D][VDIM];
 
  403      for (
int q1 = 0; q1 < Q1D; ++q1)
 
  405         for (
int q2 = 0; q2 < Q1D; q2++)
 
  407            for (
int c = 0; c < VDIM; c++)
 
  409               BBu0[q1][q2][c] = 0.0;
 
  410               BBu1[q1][q2][c] = 0.0;
 
  412            for (
int d2 = 0; d2 < D1D; ++d2)
 
  415               for (
int c = 0; c < VDIM; c++)
 
  417                  BBu0[q1][q2][c] += 
b*Bu0[q1][d2][c];
 
  418                  BBu1[q1][q2][c] += 
b*Bu1[q1][d2][c];
 
  423      real_t DBBu[max_Q1D][max_Q1D][VDIM];
 
  424      for (
int q1 = 0; q1 < Q1D; ++q1)
 
  426         for (
int q2 = 0; q2 < Q1D; q2++)
 
  428            for (
int c = 0; c < VDIM; c++)
 
  430               DBBu[q1][q2][c] = op(q1,q2,0,0,
f)*BBu0[q1][q2][c] +
 
  431                                 op(q1,q2,1,0,
f)*BBu1[q1][q2][c];
 
  435      real_t BDBBu[max_Q1D][max_D1D][VDIM];
 
  436      for (
int q1 = 0; q1 < Q1D; ++q1)
 
  438         for (
int d2 = 0; d2 < D1D; d2++)
 
  440            for (
int c = 0; c < VDIM; c++)
 
  442               BDBBu[q1][d2][c] = 0.0;
 
  444            for (
int q2 = 0; q2 < Q1D; ++q2)
 
  447               for (
int c = 0; c < VDIM; c++)
 
  449                  BDBBu[q1][d2][c] += 
b*DBBu[q1][q2][c];
 
  454      real_t BBDBBu[max_D1D][max_D1D][VDIM];
 
  455      for (
int d1 = 0; d1 < D1D; ++d1)
 
  457         for (
int d2 = 0; d2 < D1D; d2++)
 
  459            for (
int c = 0; c < VDIM; c++)
 
  461               BBDBBu[d1][d2][c] = 0.0;
 
  463            for (
int q1 = 0; q1 < Q1D; ++q1)
 
  466               for (
int c = 0; c < VDIM; c++)
 
  468                  BBDBBu[d1][d2][c] += 
b*BDBBu[q1][d2][c];
 
  471            for (
int c = 0; c < VDIM; c++)
 
  473               y(d1,d2,c,0,
f) +=  BBDBBu[d1][d2][c];
 
  474               y(d1,d2,c,1,
f) += -BBDBBu[d1][d2][c];
 
  482template<
int T_D1D = 0, 
int T_Q1D = 0, 
int T_NBZ = 0> 
static 
  483void SmemPADGTraceApply3D(
const int NF,
 
  484                          const Array<real_t> &
b,
 
  485                          const Array<real_t> &bt,
 
  492   const int D1D = T_D1D ? T_D1D : d1d;
 
  493   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  494   constexpr int NBZ = T_NBZ ? T_NBZ : 1;
 
  497   auto B = 
Reshape(
b.Read(), Q1D, D1D);
 
  498   auto Bt = 
Reshape(bt.Read(), D1D, Q1D);
 
  499   auto op = 
Reshape(op_.Read(), Q1D, Q1D, 2, 2, NF);
 
  500   auto x = 
Reshape(x_.Read(), D1D, D1D, 2, NF);
 
  501   auto y = 
Reshape(y_.ReadWrite(), D1D, D1D, 2, NF);
 
  505      const int tidz = MFEM_THREAD_ID(z);
 
  506      const int D1D = T_D1D ? T_D1D : d1d;
 
  507      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  509      constexpr int NBZ = T_NBZ ? T_NBZ : 1;
 
  510      constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  511      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  512      MFEM_SHARED 
real_t u0[NBZ][max_D1D][max_D1D];
 
  513      MFEM_SHARED 
real_t u1[NBZ][max_D1D][max_D1D];
 
  514      MFEM_FOREACH_THREAD(d1,x,D1D)
 
  516         MFEM_FOREACH_THREAD(d2,y,D1D)
 
  518            u0[tidz][d1][d2] = x(d1,d2,0,
f);
 
  519            u1[tidz][d1][d2] = x(d1,d2,1,
f);
 
  523      MFEM_SHARED 
real_t Bu0[NBZ][max_Q1D][max_D1D];
 
  524      MFEM_SHARED 
real_t Bu1[NBZ][max_Q1D][max_D1D];
 
  525      MFEM_FOREACH_THREAD(q1,x,Q1D)
 
  527         MFEM_FOREACH_THREAD(d2,y,D1D)
 
  531            for (
int d1 = 0; d1 < D1D; ++d1)
 
  534               Bu0_ += 
b*u0[tidz][d1][d2];
 
  535               Bu1_ += 
b*u1[tidz][d1][d2];
 
  537            Bu0[tidz][q1][d2] = Bu0_;
 
  538            Bu1[tidz][q1][d2] = Bu1_;
 
  542      MFEM_SHARED 
real_t BBu0[NBZ][max_Q1D][max_Q1D];
 
  543      MFEM_SHARED 
real_t BBu1[NBZ][max_Q1D][max_Q1D];
 
  544      MFEM_FOREACH_THREAD(q1,x,Q1D)
 
  546         MFEM_FOREACH_THREAD(q2,y,Q1D)
 
  550            for (
int d2 = 0; d2 < D1D; ++d2)
 
  553               BBu0_ += 
b*Bu0[tidz][q1][d2];
 
  554               BBu1_ += 
b*Bu1[tidz][q1][d2];
 
  556            BBu0[tidz][q1][q2] = BBu0_;
 
  557            BBu1[tidz][q1][q2] = BBu1_;
 
  561      MFEM_SHARED 
real_t DBBu[NBZ][max_Q1D][max_Q1D];
 
  562      MFEM_FOREACH_THREAD(q1,x,Q1D)
 
  564         MFEM_FOREACH_THREAD(q2,y,Q1D)
 
  566            DBBu[tidz][q1][q2] = op(q1,q2,0,0,
f)*BBu0[tidz][q1][q2] +
 
  567                                 op(q1,q2,1,0,
f)*BBu1[tidz][q1][q2];
 
  571      MFEM_SHARED 
real_t BDBBu[NBZ][max_Q1D][max_D1D];
 
  572      MFEM_FOREACH_THREAD(q1,x,Q1D)
 
  574         MFEM_FOREACH_THREAD(d2,y,D1D)
 
  577            for (
int q2 = 0; q2 < Q1D; ++q2)
 
  580               BDBBu_ += 
b*DBBu[tidz][q1][q2];
 
  582            BDBBu[tidz][q1][d2] = BDBBu_;
 
  586      MFEM_FOREACH_THREAD(d1,x,D1D)
 
  588         MFEM_FOREACH_THREAD(d2,y,D1D)
 
  591            for (
int q1 = 0; q1 < Q1D; ++q1)
 
  594               BBDBBu_ += 
b*BDBBu[tidz][q1][d2];
 
  596            y(d1,d2,0,
f) +=  BBDBBu_;
 
  597            y(d1,d2,1,
f) += -BBDBBu_;
 
  603static void PADGTraceApply(
const int dim,
 
  607                           const Array<real_t> &B,
 
  608                           const Array<real_t> &Bt,
 
  615      switch ((D1D << 4 ) | Q1D)
 
  617         case 0x22: 
return PADGTraceApply2D<2,2>(NF,B,Bt,op,x,y);
 
  618         case 0x33: 
return PADGTraceApply2D<3,3>(NF,B,Bt,op,x,y);
 
  619         case 0x44: 
return PADGTraceApply2D<4,4>(NF,B,Bt,op,x,y);
 
  620         case 0x55: 
return PADGTraceApply2D<5,5>(NF,B,Bt,op,x,y);
 
  621         case 0x66: 
return PADGTraceApply2D<6,6>(NF,B,Bt,op,x,y);
 
  622         case 0x77: 
return PADGTraceApply2D<7,7>(NF,B,Bt,op,x,y);
 
  623         case 0x88: 
return PADGTraceApply2D<8,8>(NF,B,Bt,op,x,y);
 
  624         case 0x99: 
return PADGTraceApply2D<9,9>(NF,B,Bt,op,x,y);
 
  625         default:   
return PADGTraceApply2D(NF,B,Bt,op,x,y,D1D,Q1D);
 
  630      switch ((D1D << 4 ) | Q1D)
 
  632         case 0x22: 
return SmemPADGTraceApply3D<2,2,1>(NF,B,Bt,op,x,y);
 
  633         case 0x23: 
return SmemPADGTraceApply3D<2,3,1>(NF,B,Bt,op,x,y);
 
  634         case 0x34: 
return SmemPADGTraceApply3D<3,4,2>(NF,B,Bt,op,x,y);
 
  635         case 0x45: 
return SmemPADGTraceApply3D<4,5,2>(NF,B,Bt,op,x,y);
 
  636         case 0x56: 
return SmemPADGTraceApply3D<5,6,1>(NF,B,Bt,op,x,y);
 
  637         case 0x67: 
return SmemPADGTraceApply3D<6,7,1>(NF,B,Bt,op,x,y);
 
  638         case 0x78: 
return SmemPADGTraceApply3D<7,8,1>(NF,B,Bt,op,x,y);
 
  639         case 0x89: 
return SmemPADGTraceApply3D<8,9,1>(NF,B,Bt,op,x,y);
 
  640         default:   
return PADGTraceApply3D(NF,B,Bt,op,x,y,D1D,Q1D);
 
  643   MFEM_ABORT(
"Unknown kernel.");
 
  647template<
int T_D1D = 0, 
int T_Q1D = 0> 
static 
  648void PADGTraceApplyTranspose2D(
const int NF,
 
  649                               const Array<real_t> &
b,
 
  650                               const Array<real_t> &bt,
 
  658   const int D1D = T_D1D ? T_D1D : d1d;
 
  659   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  662   auto B = 
Reshape(
b.Read(), Q1D, D1D);
 
  663   auto Bt = 
Reshape(bt.Read(), D1D, Q1D);
 
  664   auto op = 
Reshape(op_.Read(), Q1D, 2, 2, NF);
 
  665   auto x = 
Reshape(x_.Read(), D1D, VDIM, 2, NF);
 
  666   auto y = 
Reshape(y_.ReadWrite(), D1D, VDIM, 2, NF);
 
  671      const int D1D = T_D1D ? T_D1D : d1d;
 
  672      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  674      constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  675      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  678      for (
int d = 0; d < D1D; d++)
 
  680         for (
int c = 0; c < VDIM; c++)
 
  682            u0[d][c] = x(d,c,0,
f);
 
  683            u1[d][c] = x(d,c,1,
f);
 
  686      real_t Bu0[max_Q1D][VDIM];
 
  687      real_t Bu1[max_Q1D][VDIM];
 
  688      for (
int q = 0; q < Q1D; ++q)
 
  690         for (
int c = 0; c < VDIM; c++)
 
  695         for (
int d = 0; d < D1D; ++d)
 
  698            for (
int c = 0; c < VDIM; c++)
 
  700               Bu0[q][c] += 
b*u0[d][c];
 
  701               Bu1[q][c] += 
b*u1[d][c];
 
  705      real_t DBu0[max_Q1D][VDIM];
 
  706      real_t DBu1[max_Q1D][VDIM];
 
  707      for (
int q = 0; q < Q1D; ++q)
 
  709         for (
int c = 0; c < VDIM; c++)
 
  711            DBu0[q][c] = op(q,0,0,
f)*Bu0[q][c] + op(q,0,1,
f)*Bu1[q][c];
 
  712            DBu1[q][c] = op(q,1,0,
f)*Bu0[q][c] + op(q,1,1,
f)*Bu1[q][c];
 
  715      real_t BDBu0[max_D1D][VDIM];
 
  716      real_t BDBu1[max_D1D][VDIM];
 
  717      for (
int d = 0; d < D1D; ++d)
 
  719         for (
int c = 0; c < VDIM; c++)
 
  724         for (
int q = 0; q < Q1D; ++q)
 
  727            for (
int c = 0; c < VDIM; c++)
 
  729               BDBu0[d][c] += 
b*DBu0[q][c];
 
  730               BDBu1[d][c] += 
b*DBu1[q][c];
 
  733         for (
int c = 0; c < VDIM; c++)
 
  735            y(d,c,0,
f) += BDBu0[d][c];
 
  736            y(d,c,1,
f) += BDBu1[d][c];
 
  743template<
int T_D1D = 0, 
int T_Q1D = 0> 
static 
  744void PADGTraceApplyTranspose3D(
const int NF,
 
  745                               const Array<real_t> &
b,
 
  746                               const Array<real_t> &bt,
 
  754   const int D1D = T_D1D ? T_D1D : d1d;
 
  755   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  758   auto B = 
Reshape(
b.Read(), Q1D, D1D);
 
  759   auto Bt = 
Reshape(bt.Read(), D1D, Q1D);
 
  760   auto op = 
Reshape(op_.Read(), Q1D, Q1D, 2, 2, NF);
 
  761   auto x = 
Reshape(x_.Read(), D1D, D1D, VDIM, 2, NF);
 
  762   auto y = 
Reshape(y_.ReadWrite(), D1D, D1D, VDIM, 2, NF);
 
  767      const int D1D = T_D1D ? T_D1D : d1d;
 
  768      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  770      constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  771      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  772      real_t u0[max_D1D][max_D1D][VDIM];
 
  773      real_t u1[max_D1D][max_D1D][VDIM];
 
  774      for (
int d1 = 0; d1 < D1D; d1++)
 
  776         for (
int d2 = 0; d2 < D1D; d2++)
 
  778            for (
int c = 0; c < VDIM; c++)
 
  780               u0[d1][d2][c] = x(d1,d2,c,0,
f);
 
  781               u1[d1][d2][c] = x(d1,d2,c,1,
f);
 
  785      real_t Bu0[max_Q1D][max_D1D][VDIM];
 
  786      real_t Bu1[max_Q1D][max_D1D][VDIM];
 
  787      for (
int q1 = 0; q1 < Q1D; ++q1)
 
  789         for (
int d2 = 0; d2 < D1D; ++d2)
 
  791            for (
int c = 0; c < VDIM; c++)
 
  793               Bu0[q1][d2][c] = 0.0;
 
  794               Bu1[q1][d2][c] = 0.0;
 
  796            for (
int d1 = 0; d1 < D1D; ++d1)
 
  799               for (
int c = 0; c < VDIM; c++)
 
  801                  Bu0[q1][d2][c] += 
b*u0[d1][d2][c];
 
  802                  Bu1[q1][d2][c] += 
b*u1[d1][d2][c];
 
  807      real_t BBu0[max_Q1D][max_Q1D][VDIM];
 
  808      real_t BBu1[max_Q1D][max_Q1D][VDIM];
 
  809      for (
int q1 = 0; q1 < Q1D; ++q1)
 
  811         for (
int q2 = 0; q2 < Q1D; ++q2)
 
  813            for (
int c = 0; c < VDIM; c++)
 
  815               BBu0[q1][q2][c] = 0.0;
 
  816               BBu1[q1][q2][c] = 0.0;
 
  818            for (
int d2 = 0; d2 < D1D; ++d2)
 
  821               for (
int c = 0; c < VDIM; c++)
 
  823                  BBu0[q1][q2][c] += 
b*Bu0[q1][d2][c];
 
  824                  BBu1[q1][q2][c] += 
b*Bu1[q1][d2][c];
 
  829      real_t DBu0[max_Q1D][max_Q1D][VDIM];
 
  830      real_t DBu1[max_Q1D][max_Q1D][VDIM];
 
  831      for (
int q1 = 0; q1 < Q1D; ++q1)
 
  833         for (
int q2 = 0; q2 < Q1D; ++q2)
 
  835            const real_t D00 = op(q1,q2,0,0,
f);
 
  836            const real_t D01 = op(q1,q2,0,1,
f);
 
  837            const real_t D10 = op(q1,q2,1,0,
f);
 
  838            const real_t D11 = op(q1,q2,1,1,
f);
 
  839            for (
int c = 0; c < VDIM; c++)
 
  841               DBu0[q1][q2][c] = D00*BBu0[q1][q2][c] + D01*BBu1[q1][q2][c];
 
  842               DBu1[q1][q2][c] = D10*BBu0[q1][q2][c] + D11*BBu1[q1][q2][c];
 
  846      real_t BDBu0[max_D1D][max_Q1D][VDIM];
 
  847      real_t BDBu1[max_D1D][max_Q1D][VDIM];
 
  848      for (
int d1 = 0; d1 < D1D; ++d1)
 
  850         for (
int q2 = 0; q2 < Q1D; ++q2)
 
  852            for (
int c = 0; c < VDIM; c++)
 
  854               BDBu0[d1][q2][c] = 0.0;
 
  855               BDBu1[d1][q2][c] = 0.0;
 
  857            for (
int q1 = 0; q1 < Q1D; ++q1)
 
  860               for (
int c = 0; c < VDIM; c++)
 
  862                  BDBu0[d1][q2][c] += 
b*DBu0[q1][q2][c];
 
  863                  BDBu1[d1][q2][c] += 
b*DBu1[q1][q2][c];
 
  868      real_t BBDBu0[max_D1D][max_D1D][VDIM];
 
  869      real_t BBDBu1[max_D1D][max_D1D][VDIM];
 
  870      for (
int d1 = 0; d1 < D1D; ++d1)
 
  872         for (
int d2 = 0; d2 < D1D; ++d2)
 
  874            for (
int c = 0; c < VDIM; c++)
 
  876               BBDBu0[d1][d2][c] = 0.0;
 
  877               BBDBu1[d1][d2][c] = 0.0;
 
  879            for (
int q2 = 0; q2 < Q1D; ++q2)
 
  882               for (
int c = 0; c < VDIM; c++)
 
  884                  BBDBu0[d1][d2][c] += 
b*BDBu0[d1][q2][c];
 
  885                  BBDBu1[d1][d2][c] += 
b*BDBu1[d1][q2][c];
 
  888            for (
int c = 0; c < VDIM; c++)
 
  890               y(d1,d2,c,0,
f) += BBDBu0[d1][d2][c];
 
  891               y(d1,d2,c,1,
f) += BBDBu1[d1][d2][c];
 
  899template<
int T_D1D = 0, 
int T_Q1D = 0, 
int T_NBZ = 0> 
static 
  900void SmemPADGTraceApplyTranspose3D(
const int NF,
 
  901                                   const Array<real_t> &
b,
 
  902                                   const Array<real_t> &bt,
 
  909   const int D1D = T_D1D ? T_D1D : d1d;
 
  910   const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  911   constexpr int NBZ = T_NBZ ? T_NBZ : 1;
 
  914   auto B = 
Reshape(
b.Read(), Q1D, D1D);
 
  915   auto Bt = 
Reshape(bt.Read(), D1D, Q1D);
 
  916   auto op = 
Reshape(op_.Read(), Q1D, Q1D, 2, 2, NF);
 
  917   auto x = 
Reshape(x_.Read(), D1D, D1D, 2, NF);
 
  918   auto y = 
Reshape(y_.ReadWrite(), D1D, D1D, 2, NF);
 
  922      const int tidz = MFEM_THREAD_ID(z);
 
  923      const int D1D = T_D1D ? T_D1D : d1d;
 
  924      const int Q1D = T_Q1D ? T_Q1D : q1d;
 
  926      constexpr int NBZ = T_NBZ ? T_NBZ : 1;
 
  927      constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
 
  928      constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
 
  929      MFEM_SHARED 
real_t u0[NBZ][max_D1D][max_D1D];
 
  930      MFEM_SHARED 
real_t u1[NBZ][max_D1D][max_D1D];
 
  931      MFEM_FOREACH_THREAD(d1,x,D1D)
 
  933         MFEM_FOREACH_THREAD(d2,y,D1D)
 
  935            u0[tidz][d1][d2] = x(d1,d2,0,
f);
 
  936            u1[tidz][d1][d2] = x(d1,d2,1,
f);
 
  940      MFEM_SHARED 
real_t Bu0[NBZ][max_Q1D][max_D1D];
 
  941      MFEM_SHARED 
real_t Bu1[NBZ][max_Q1D][max_D1D];
 
  942      MFEM_FOREACH_THREAD(q1,x,Q1D)
 
  944         MFEM_FOREACH_THREAD(d2,y,D1D)
 
  948            for (
int d1 = 0; d1 < D1D; ++d1)
 
  951               Bu0_ += 
b*u0[tidz][d1][d2];
 
  952               Bu1_ += 
b*u1[tidz][d1][d2];
 
  954            Bu0[tidz][q1][d2] = Bu0_;
 
  955            Bu1[tidz][q1][d2] = Bu1_;
 
  959      MFEM_SHARED 
real_t BBu0[NBZ][max_Q1D][max_Q1D];
 
  960      MFEM_SHARED 
real_t BBu1[NBZ][max_Q1D][max_Q1D];
 
  961      MFEM_FOREACH_THREAD(q1,x,Q1D)
 
  963         MFEM_FOREACH_THREAD(q2,y,Q1D)
 
  967            for (
int d2 = 0; d2 < D1D; ++d2)
 
  970               BBu0_ += 
b*Bu0[tidz][q1][d2];
 
  971               BBu1_ += 
b*Bu1[tidz][q1][d2];
 
  973            BBu0[tidz][q1][q2] = BBu0_;
 
  974            BBu1[tidz][q1][q2] = BBu1_;
 
  978      MFEM_SHARED 
real_t DBBu0[NBZ][max_Q1D][max_Q1D];
 
  979      MFEM_SHARED 
real_t DBBu1[NBZ][max_Q1D][max_Q1D];
 
  980      MFEM_FOREACH_THREAD(q1,x,Q1D)
 
  982         MFEM_FOREACH_THREAD(q2,y,Q1D)
 
  984            const real_t D00 = op(q1,q2,0,0,
f);
 
  985            const real_t D01 = op(q1,q2,0,1,
f);
 
  986            const real_t D10 = op(q1,q2,1,0,
f);
 
  987            const real_t D11 = op(q1,q2,1,1,
f);
 
  988            const real_t u0q = BBu0[tidz][q1][q2];
 
  989            const real_t u1q = BBu1[tidz][q1][q2];
 
  990            DBBu0[tidz][q1][q2] = D00*u0q + D01*u1q;
 
  991            DBBu1[tidz][q1][q2] = D10*u0q + D11*u1q;
 
  995      MFEM_SHARED 
real_t BDBBu0[NBZ][max_Q1D][max_D1D];
 
  996      MFEM_SHARED 
real_t BDBBu1[NBZ][max_Q1D][max_D1D];
 
  997      MFEM_FOREACH_THREAD(q1,x,Q1D)
 
  999         MFEM_FOREACH_THREAD(d2,y,D1D)
 
 1003            for (
int q2 = 0; q2 < Q1D; ++q2)
 
 1006               BDBBu0_ += 
b*DBBu0[tidz][q1][q2];
 
 1007               BDBBu1_ += 
b*DBBu1[tidz][q1][q2];
 
 1009            BDBBu0[tidz][q1][d2] = BDBBu0_;
 
 1010            BDBBu1[tidz][q1][d2] = BDBBu1_;
 
 1014      MFEM_FOREACH_THREAD(d1,x,D1D)
 
 1016         MFEM_FOREACH_THREAD(d2,y,D1D)
 
 1020            for (
int q1 = 0; q1 < Q1D; ++q1)
 
 1023               BBDBBu0_ += 
b*BDBBu0[tidz][q1][d2];
 
 1024               BBDBBu1_ += 
b*BDBBu1[tidz][q1][d2];
 
 1026            y(d1,d2,0,
f) += BBDBBu0_;
 
 1027            y(d1,d2,1,
f) += BBDBBu1_;
 
 1033static void PADGTraceApplyTranspose(
const int dim,
 
 1037                                    const Array<real_t> &B,
 
 1038                                    const Array<real_t> &Bt,
 
 1045      switch ((D1D << 4 ) | Q1D)
 
 1047         case 0x22: 
return PADGTraceApplyTranspose2D<2,2>(NF,B,Bt,op,x,y);
 
 1048         case 0x33: 
return PADGTraceApplyTranspose2D<3,3>(NF,B,Bt,op,x,y);
 
 1049         case 0x44: 
return PADGTraceApplyTranspose2D<4,4>(NF,B,Bt,op,x,y);
 
 1050         case 0x55: 
return PADGTraceApplyTranspose2D<5,5>(NF,B,Bt,op,x,y);
 
 1051         case 0x66: 
return PADGTraceApplyTranspose2D<6,6>(NF,B,Bt,op,x,y);
 
 1052         case 0x77: 
return PADGTraceApplyTranspose2D<7,7>(NF,B,Bt,op,x,y);
 
 1053         case 0x88: 
return PADGTraceApplyTranspose2D<8,8>(NF,B,Bt,op,x,y);
 
 1054         case 0x99: 
return PADGTraceApplyTranspose2D<9,9>(NF,B,Bt,op,x,y);
 
 1055         default: 
return PADGTraceApplyTranspose2D(NF,B,Bt,op,x,y,D1D,Q1D);
 
 1060      switch ((D1D << 4 ) | Q1D)
 
 1062         case 0x22: 
return SmemPADGTraceApplyTranspose3D<2,2>(NF,B,Bt,op,x,y);
 
 1063         case 0x23: 
return SmemPADGTraceApplyTranspose3D<2,3>(NF,B,Bt,op,x,y);
 
 1064         case 0x34: 
return SmemPADGTraceApplyTranspose3D<3,4>(NF,B,Bt,op,x,y);
 
 1065         case 0x45: 
return SmemPADGTraceApplyTranspose3D<4,5>(NF,B,Bt,op,x,y);
 
 1066         case 0x56: 
return SmemPADGTraceApplyTranspose3D<5,6>(NF,B,Bt,op,x,y);
 
 1067         case 0x67: 
return SmemPADGTraceApplyTranspose3D<6,7>(NF,B,Bt,op,x,y);
 
 1068         case 0x78: 
return SmemPADGTraceApplyTranspose3D<7,8>(NF,B,Bt,op,x,y);
 
 1069         case 0x89: 
return SmemPADGTraceApplyTranspose3D<8,9>(NF,B,Bt,op,x,y);
 
 1070         default: 
return PADGTraceApplyTranspose3D(NF,B,Bt,op,x,y,D1D,Q1D);
 
 1073   MFEM_ABORT(
"Unknown kernel.");
 
const T * Read(bool on_dev=true) const
Shortcut for mfem::Read(a.GetMemory(), a.Size(), on_dev).
virtual real_t Eval(ElementTransformation &T, const IntegrationPoint &ip)=0
Evaluate the coefficient in the element described by T at the point ip.
void AddMultPA(const Vector &, Vector &) const override
Method for partially assembled action.
void AssemblePABoundaryFaces(const FiniteElementSpace &fes) override
void AddMultTransposePA(const Vector &x, Vector &y) const override
Method for partially assembled transposed action.
void AssemblePAInteriorFaces(const FiniteElementSpace &fes) override
static const IntegrationRule & GetRule(Geometry::Type geom, int order, const FaceElementTransformations &T)
const FaceGeometricFactors * geom
Not owned.
const DofToQuad * maps
Not owned.
static MemoryType GetMemoryType()
(DEPRECATED) Equivalent to GetDeviceMemoryType().
static MemoryType GetDeviceMemoryType()
Get the current Device MemoryType. This is the MemoryType used by most MFEM classes when allocating m...
@ TENSOR
Tensor product representation using 1D matrices/tensors with dimensions using 1D number of quadrature...
Array< real_t > B
Basis functions evaluated at quadrature points.
int ndof
Number of degrees of freedom = number of basis functions. When mode is TENSOR, this is the 1D number.
int nqpt
Number of quadrature points. When mode is TENSOR, this is the 1D number.
Array< real_t > Bt
Transpose of B.
Vector normal
Normals at all quadrature points.
Vector detJ
Determinants of the Jacobians at all quadrature points.
Class FiniteElementSpace - responsible for providing FEM view of the mesh, mainly managing the set of...
const IntegrationRule * IntRule
virtual const real_t * HostRead() const
Shortcut for mfem::Read(vec.GetMemory(), vec.Size(), false).
virtual const real_t * Read(bool on_dev=true) const
Shortcut for mfem::Read(vec.GetMemory(), vec.Size(), on_dev).
virtual real_t * ReadWrite(bool on_dev=true)
Shortcut for mfem::ReadWrite(vec.GetMemory(), vec.Size(), on_dev).
void SetSize(int s)
Resize the vector to size s.
const T * Read(const Memory< T > &mem, int size, bool on_dev=true)
Get a pointer for read access to mem with the mfem::Device's DeviceMemoryClass, if on_dev = true,...
real_t u(const Vector &xvec)
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
int ToLexOrdering(const int dim, const int face_id, const int size1d, const int index)
Convert a dof face index from Native ordering to lexicographic ordering for quads and hexes.
@ COMPRESSED
Enable all above compressions.
MemoryType
Memory types supported by MFEM.
std::function< real_t(const Vector &)> f(real_t mass_coeff)
void forall(int N, lambda &&body)
void vel(const Vector &x, real_t t, Vector &u)
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.