21using mfem::kernels::internal::SetMaxOf;
31template<
int T_SDIM = 0,
int T_D1D = 0,
int T_Q1D = 0>
32void SmemPAVectorDiffusionApply2D(
const int NE,
34 const Array<real_t> &
b,
35 const Array<real_t> &g,
43 static constexpr int DIM = 2;
44 const int SDIM = T_SDIM ? T_SDIM : sdim;
45 const int D1D = T_D1D ? T_D1D : d1d;
46 const int Q1D = T_Q1D ? T_Q1D : q1d;
48 const int PA_SIZE =
DIM*
DIM;
49 const bool matrix_coeff = coeff_vdim ==
DIM*
DIM;
51 const auto B =
b.Read(), G = g.Read();
52 const auto DE =
Reshape(d.Read(), Q1D, Q1D, PA_SIZE,
53 SDIM * (matrix_coeff ?
SDIM : 1), NE);
54 const auto XE =
Reshape(x.Read(), D1D, D1D,
SDIM, NE);
55 auto YE =
Reshape(y.ReadWrite(), D1D, D1D,
SDIM, NE);
59 constexpr int MD1 = T_D1D > 0 ? SetMaxOf(T_D1D) :
DofQuadLimits::MAX_T1D;
60 constexpr int MQ1 = T_Q1D > 0 ? SetMaxOf(T_Q1D) :
DofQuadLimits::MAX_T1D;
62 MFEM_SHARED
real_t sB[MD1][MQ1], sG[MD1][MQ1], smem[MQ1][MQ1];
63 kernels::internal::vd_regs2d_t<3, DIM, MQ1> r0, r1;
64 kernels::internal::LoadMatrix(D1D, Q1D, B, sB);
65 kernels::internal::LoadMatrix(D1D, Q1D, G, sG);
67 for (
int i = 0; i <
SDIM; i++)
69 for (
int j = 0; j < (matrix_coeff ?
SDIM : 1); j++)
71 kernels::internal::LoadDofs2d(e, D1D, i, XE, r0);
72 kernels::internal::Grad2d(D1D, Q1D, smem, sB, sG, r0, r1, i);
73 MFEM_FOREACH_THREAD_DIRECT(qy, y, Q1D)
75 MFEM_FOREACH_THREAD_DIRECT(qx, x, Q1D)
77 const real_t gradX = r1[i][0][qy][qx];
78 const real_t gradY = r1[i][1][qy][qx];
79 const int k = matrix_coeff ? (j + i *
SDIM) : i;
80 const real_t O11 = DE(qx,qy,0,k,e), O12 = DE(qx,qy,1,k,e);
81 const real_t O21 = DE(qx,qy,2,k,e), O22 = DE(qx,qy,3,k,e);
82 r0[i][0][qy][qx] = (O11 * gradX) + (O12 * gradY);
83 r0[i][1][qy][qx] = (O21 * gradX) + (O22 * gradY);
87 kernels::internal::GradTranspose2d(D1D, Q1D, smem, sB, sG, r0, r1, i);
88 const int ij = matrix_coeff ? j : i;
89 kernels::internal::WriteDofs2d(e, D1D, i, ij, r1, YE);
95template<
int T_SDIM = 0,
int T_D1D = 0,
int T_Q1D = 0>
96void SmemPAVectorDiffusionApply3D(
const int NE,
98 const Array<real_t> &
b,
99 const Array<real_t> &g,
108 static constexpr int DIM = 3;
109 const int SDIM = T_SDIM ? T_SDIM : sdim;
110 MFEM_VERIFY(
SDIM == 3,
"SDIM must be 3");
111 const int D1D = T_D1D ? T_D1D : d1d;
112 const int Q1D = T_Q1D ? T_Q1D : q1d;
114 const int PA_SIZE =
DIM*
DIM;
115 const bool matrix_coeff = coeff_vdim ==
DIM*
DIM;
117 const auto B =
b.Read(), G = g.Read();
118 const auto DE =
Reshape(d.Read(), Q1D, Q1D, Q1D, PA_SIZE,
119 SDIM * (matrix_coeff ?
SDIM : 1), NE);
120 const auto XE =
Reshape(x.Read(), D1D, D1D, D1D,
SDIM, NE);
121 auto YE =
Reshape(y.ReadWrite(), D1D, D1D, D1D,
SDIM, NE);
125 constexpr int MD1 = T_D1D > 0 ? SetMaxOf(T_D1D) :
DofQuadLimits::MAX_T1D;
126 constexpr int MQ1 = T_Q1D > 0 ? SetMaxOf(T_Q1D) :
DofQuadLimits::MAX_T1D;
128 MFEM_SHARED
real_t sB[MD1][MQ1], sG[MD1][MQ1], smem[MQ1][MQ1];
129 kernels::internal::vd_regs3d_t<3, DIM, MQ1> r0, r1;
130 kernels::internal::LoadMatrix(D1D, Q1D, B, sB);
131 kernels::internal::LoadMatrix(D1D, Q1D, G, sG);
133 for (
int i = 0; i <
SDIM; i++)
135 for (
int j = 0; j < (matrix_coeff ?
SDIM : 1); j++)
137 kernels::internal::LoadDofs3d(e, D1D, i, XE, r0);
138 kernels::internal::Grad3d(D1D, Q1D, smem, sB, sG, r0, r1, i);
139 for (
int qz = 0; qz < Q1D; qz++)
141 MFEM_FOREACH_THREAD_DIRECT(qy, y, Q1D)
143 MFEM_FOREACH_THREAD_DIRECT(qx, x, Q1D)
145 const real_t gradX = r1[i][0][qz][qy][qx];
146 const real_t gradY = r1[i][1][qz][qy][qx];
147 const real_t gradZ = r1[i][2][qz][qy][qx];
148 const int k = matrix_coeff ? (j + i *
SDIM) : i;
149 const real_t O11 = DE(qx,qy,qz,0,k,e), O12 = DE(qx,qy,qz,1,k,e),
150 O13 = DE(qx,qy,qz,2,k,e);
151 const real_t O22 = DE(qx,qy,qz,3,k,e), O23 = DE(qx,qy,qz,4,k,e);
152 const real_t O33 = DE(qx,qy,qz,5,k,e);
153 r0[i][0][qz][qy][qx] = (O11*gradX)+(O12*gradY)+(O13*gradZ);
154 r0[i][1][qz][qy][qx] = (O12*gradX)+(O22*gradY)+(O23*gradZ);
155 r0[i][2][qz][qy][qx] = (O13*gradX)+(O23*gradY)+(O33*gradZ);
160 kernels::internal::GradTranspose3d(D1D, Q1D, smem, sB, sG, r0, r1, i);
161 const int ij = matrix_coeff ? j : i;
162 kernels::internal::WriteDofs3d(e, D1D, i, ij, r1, YE);
170template<
int DIM,
int T_SDIM,
int T_D1D,
int T_Q1D>
172VectorDiffusionIntegrator::ApplyPAKernels::Kernel()
176 return internal::SmemPAVectorDiffusionApply2D<T_SDIM, T_D1D, T_Q1D>;
180 return internal::SmemPAVectorDiffusionApply3D<T_SDIM, T_D1D, T_Q1D>;
182 else { MFEM_ABORT(
"Unsupported kernel"); }
186VectorDiffusionIntegrator::ApplyPAKernels::Fallback(
int dim,
int sdim,
191 return internal::SmemPAVectorDiffusionApply2D;
195 return internal::SmemPAVectorDiffusionApply3D;
197 else { MFEM_ABORT(
"Unsupported kernel"); }
void(*)(const int, const int, const Array< real_t > &, const Array< real_t > &, const Vector &, const Vector &, Vector &, const int, const int, const int) ApplyKernelType
arguments: ne, coeff_vdim, B, G, pa_data, x, y, d1d, q1d, vdim
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
internal::DofQuadLimits_CUDA DofQuadLimits
Maximum number of 1D DOFs or quadrature points for the architecture currently being compiled for (use...
void forall_2D(int N, int X, int Y, lambda &&body)