21using mfem::kernels::internal::SetMaxOf;
31template <
int T_D1D = 0,
int T_Q1D = 0>
32void SmemPAVectorMassApply2D(
const int NE,
34 const Array<real_t> &
b,
41 static constexpr int DIM = 2, VDIM = 2;
42 const int D1D = T_D1D ? T_D1D : d1d;
43 const int Q1D = T_Q1D ? T_Q1D : q1d;
45 const bool const_coeff = coeff_vdim == 1;
46 const bool vector_coeff = coeff_vdim ==
DIM;
47 const bool matrix_coeff = coeff_vdim ==
DIM*
DIM;
49 const auto B =
b.Read();
50 const auto D =
Reshape(d.Read(), Q1D, Q1D, coeff_vdim, NE);
51 const auto X =
Reshape(x.Read(), D1D, D1D, VDIM, NE);
52 auto Y =
Reshape(y.ReadWrite(), D1D, D1D, VDIM, NE);
56 constexpr int MD1 = T_D1D > 0 ? SetMaxOf(T_D1D) :
DofQuadLimits::MAX_T1D;
57 constexpr int MQ1 = T_Q1D > 0 ? SetMaxOf(T_Q1D) :
DofQuadLimits::MAX_T1D;
59 MFEM_SHARED
real_t sB[MD1][MQ1], smem[MQ1][MQ1];
60 kernels::internal::v_regs2d_t<VDIM, MQ1> r0, r1;
61 kernels::internal::LoadMatrix(D1D, Q1D, B, sB);
62 kernels::internal::LoadDofs2d(e, D1D, X, r0);
63 kernels::internal::Eval2d(D1D, Q1D, smem, sB, r0, r1);
65 MFEM_FOREACH_THREAD_DIRECT(qy, y, Q1D)
67 MFEM_FOREACH_THREAD_DIRECT(qx, x, Q1D)
69 const real_t Qx = r1[0][qy][qx];
70 const real_t Qy = r1[1][qy][qx];
71 const real_t D0 = D(qx, qy, 0, e);
75 r0[0][qy][qx] = D0 * Qx;
76 r0[1][qy][qx] = D0 * Qy;
80 const real_t D1 = D(qx, qy, 1, e);
81 r0[0][qy][qx] = D0 * Qx;
82 r0[1][qy][qx] = D1 * Qy;
86 const real_t D1 = D(qx, qy, 1, e);
87 const real_t D2 = D(qx, qy, 2, e);
88 const real_t D3 = D(qx, qy, 3, e);
89 r0[0][qy][qx] = D0 * Qx + D1 * Qy;
90 r0[1][qy][qx] = D2 * Qx + D3 * Qy;
94 kernels::internal::EvalTranspose2d(D1D, Q1D, smem, sB, r0, r1);
95 kernels::internal::WriteDofs2d(e, D1D, r1, Y);
99template <
int T_D1D = 0,
int T_Q1D = 0>
100void SmemPAVectorMassApply3D(
const int NE,
101 const int coeff_vdim,
102 const Array<real_t> &
b,
109 static constexpr int VDIM = 3;
110 const int D1D = T_D1D ? T_D1D : d1d;
111 const int Q1D = T_Q1D ? T_Q1D : q1d;
113 const bool const_coeff = coeff_vdim == 1;
114 const bool vector_coeff = coeff_vdim == VDIM;
115 const bool matrix_coeff = coeff_vdim == VDIM*VDIM;
117 const auto B =
b.Read();
118 const auto D =
Reshape(d.Read(), Q1D, Q1D, Q1D, coeff_vdim, NE);
119 const auto X =
Reshape(x.Read(), D1D, D1D, D1D, VDIM, NE);
120 auto Y =
Reshape(y.ReadWrite(), D1D, D1D, D1D, VDIM, NE);
124 constexpr int MD1 = T_D1D > 0 ? SetMaxOf(T_D1D) :
DofQuadLimits::MAX_T1D;
125 constexpr int MQ1 = T_Q1D > 0 ? SetMaxOf(T_Q1D) :
DofQuadLimits::MAX_T1D;
127 MFEM_SHARED
real_t sB[MD1][MQ1], smem[MQ1][MQ1];
128 kernels::internal::v_regs3d_t<VDIM, MQ1> r0, r1;
129 kernels::internal::LoadMatrix(D1D, Q1D, B, sB);
130 kernels::internal::LoadDofs3d(e, D1D, X, r0);
131 kernels::internal::Eval3d(D1D, Q1D, smem, sB, r0, r1);
133 for (
int qz = 0; qz < Q1D; qz++)
135 MFEM_FOREACH_THREAD_DIRECT(qy, y, Q1D)
137 MFEM_FOREACH_THREAD_DIRECT(qx, x, Q1D)
139 const real_t Qx = r1[0][qz][qy][qx];
140 const real_t Qy = r1[1][qz][qy][qx];
141 const real_t Qz = r1[2][qz][qy][qx];
142 const real_t D0 = D(qx, qy, qz, 0, e);
145 r0[0][qz][qy][qx] = D0 * Qx;
146 r0[1][qz][qy][qx] = D0 * Qy;
147 r0[2][qz][qy][qx] = D0 * Qz;
151 const real_t D1 = D(qx, qy, qz, 1, e);
152 const real_t D2 = D(qx, qy, qz, 2, e);
153 r0[0][qz][qy][qx] = D0 * Qx;
154 r0[1][qz][qy][qx] = D1 * Qy;
155 r0[2][qz][qy][qx] = D2 * Qz;
159 const real_t D1 = D(qx, qy, qz, 1, e);
160 const real_t D2 = D(qx, qy, qz, 2, e);
161 const real_t D3 = D(qx, qy, qz, 3, e);
162 const real_t D4 = D(qx, qy, qz, 4, e);
163 const real_t D5 = D(qx, qy, qz, 5, e);
164 const real_t D6 = D(qx, qy, qz, 6, e);
165 const real_t D7 = D(qx, qy, qz, 7, e);
166 const real_t D8 = D(qx, qy, qz, 8, e);
167 r0[0][qz][qy][qx] = D0 * Qx + D1 * Qy + D2 * Qz;
168 r0[1][qz][qy][qx] = D3 * Qx + D4 * Qy + D5 * Qz;
169 r0[2][qz][qy][qx] = D6 * Qx + D7 * Qy + D8 * Qz;
174 kernels::internal::EvalTranspose3d(D1D, Q1D, smem, sB, r0, r1);
175 kernels::internal::WriteDofs3d(e, D1D, r1, Y);
181template<
int DIM,
int T_D1D,
int T_Q1D>
183VectorMassIntegrator::VectorMassAddMultPA::Kernel()
187 return internal::SmemPAVectorMassApply2D<T_D1D,T_Q1D>;
191 return internal::SmemPAVectorMassApply3D<T_D1D, T_Q1D>;
193 else { MFEM_ABORT(
"Unsupported kernel"); }
197VectorMassIntegrator::VectorMassAddMultPA::Fallback(
int dim,
int d1d,
int q1d)
201 return internal::SmemPAVectorMassApply2D;
205 return internal::SmemPAVectorMassApply3D;
207 else { MFEM_ABORT(
"Unsupported kernel"); }
void(*)(const int, const int, const Array< real_t > &, const Vector &, const Vector &, Vector &, const int, const int) VectorMassAddMultPAType
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
internal::DofQuadLimits_CUDA DofQuadLimits
Maximum number of 1D DOFs or quadrature points for the architecture currently being compiled for (use...
void forall_2D(int N, int X, int Y, lambda &&body)