12 #ifndef MFEM_BILININTEG_MASS_PA_HPP
13 #define MFEM_BILININTEG_MASS_PA_HPP
15 #include "../config/config.hpp"
16 #include "../general/forall.hpp"
17 #include "../linalg/dtensor.hpp"
25 template <
bool ACCUMULATE = true>
26 MFEM_HOST_DEVICE
inline
27 void PAMassApply2D_Element(
const int e,
47 for (
int dy = 0; dy < D1D; ++dy)
49 for (
int dx = 0; dx < D1D; ++dx)
56 constexpr
int max_D1D =
MAX_D1D;
57 constexpr
int max_Q1D =
MAX_Q1D;
58 double sol_xy[max_Q1D][max_Q1D];
59 for (
int qy = 0; qy < Q1D; ++qy)
61 for (
int qx = 0; qx < Q1D; ++qx)
66 for (
int dy = 0; dy < D1D; ++dy)
68 double sol_x[max_Q1D];
69 for (
int qy = 0; qy < Q1D; ++qy)
73 for (
int dx = 0; dx < D1D; ++dx)
75 const double s = X(dx,dy,e);
76 for (
int qx = 0; qx < Q1D; ++qx)
78 sol_x[qx] += B(qx,dx)*
s;
81 for (
int qy = 0; qy < Q1D; ++qy)
83 const double d2q = B(qy,dy);
84 for (
int qx = 0; qx < Q1D; ++qx)
86 sol_xy[qy][qx] += d2q * sol_x[qx];
90 for (
int qy = 0; qy < Q1D; ++qy)
92 for (
int qx = 0; qx < Q1D; ++qx)
94 sol_xy[qy][qx] *= D(qx,qy,e);
97 for (
int qy = 0; qy < Q1D; ++qy)
99 double sol_x[max_D1D];
100 for (
int dx = 0; dx < D1D; ++dx)
104 for (
int qx = 0; qx < Q1D; ++qx)
106 const double s = sol_xy[qy][qx];
107 for (
int dx = 0; dx < D1D; ++dx)
109 sol_x[dx] += Bt(dx,qx) *
s;
112 for (
int dy = 0; dy < D1D; ++dy)
114 const double q2d = Bt(dy,qy);
115 for (
int dx = 0; dx < D1D; ++dx)
117 Y(dx,dy,e) += q2d * sol_x[dx];
123 template<
int T_D1D,
int T_Q1D,
int T_NBZ,
bool ACCUMULATE = true>
124 MFEM_HOST_DEVICE
inline
125 void SmemPAMassApply2D_Element(
const int e,
134 const int D1D = T_D1D ? T_D1D : d1d;
135 const int Q1D = T_Q1D ? T_Q1D : q1d;
136 constexpr
int NBZ = T_NBZ ? T_NBZ : 1;
138 constexpr
int MQ1 = T_Q1D ? T_Q1D :
MAX_Q1D;
139 constexpr
int MD1 = T_D1D ? T_D1D :
MAX_D1D;
140 constexpr
int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
147 const int tidz = MFEM_THREAD_ID(z);
149 MFEM_SHARED
double BBt[MQ1*MD1];
150 double (*B)[MD1] = (double (*)[MD1]) BBt;
151 double (*Bt)[MQ1] = (double (*)[MQ1]) BBt;
152 MFEM_SHARED
double sm0[NBZ][MDQ*MDQ];
153 MFEM_SHARED
double sm1[NBZ][MDQ*MDQ];
154 double (*X)[MD1] = (double (*)[MD1]) (sm0 + tidz);
155 double (*DQ)[MQ1] = (double (*)[MQ1]) (sm1 + tidz);
156 double (*QQ)[MQ1] = (double (*)[MQ1]) (sm0 + tidz);
157 double (*QD)[MD1] = (double (*)[MD1]) (sm1 + tidz);
160 MFEM_FOREACH_THREAD(dy,y,D1D)
162 MFEM_FOREACH_THREAD(dx,x,D1D)
164 X[dy][dx] = x(dx,dy,e);
169 MFEM_FOREACH_THREAD(dy,y,D1D)
171 MFEM_FOREACH_THREAD(q,x,Q1D)
178 MFEM_FOREACH_THREAD(dy,y,D1D)
180 MFEM_FOREACH_THREAD(qx,x,Q1D)
183 for (
int dx = 0; dx < D1D; ++dx)
185 dq += X[dy][dx] * B[qx][dx];
191 MFEM_FOREACH_THREAD(qy,y,Q1D)
193 MFEM_FOREACH_THREAD(qx,x,Q1D)
196 for (
int dy = 0; dy < D1D; ++dy)
198 qq += DQ[dy][qx] * B[qy][dy];
200 QQ[qy][qx] = qq * D(qx, qy, e);
206 MFEM_FOREACH_THREAD(dy,y,D1D)
208 MFEM_FOREACH_THREAD(q,x,Q1D)
215 MFEM_FOREACH_THREAD(qy,y,Q1D)
217 MFEM_FOREACH_THREAD(dx,x,D1D)
220 for (
int qx = 0; qx < Q1D; ++qx)
222 dq += QQ[qy][qx] * Bt[dx][qx];
228 MFEM_FOREACH_THREAD(dy,y,D1D)
230 MFEM_FOREACH_THREAD(dx,x,D1D)
233 for (
int qy = 0; qy < Q1D; ++qy)
235 dd += (QD[qy][dx] * Bt[dy][qy]);
249 template <
bool ACCUMULATE = true>
250 MFEM_HOST_DEVICE
inline
251 void PAMassApply3D_Element(
const int e,
265 auto D = DeviceTensor<4,const double>(d_, Q1D, Q1D, Q1D, NE);
266 auto X = DeviceTensor<4,const double>(x_, D1D, D1D, D1D, NE);
267 auto Y = DeviceTensor<4,double>(y_, D1D, D1D, D1D, NE);
271 for (
int dz = 0; dz < D1D; ++dz)
273 for (
int dy = 0; dy < D1D; ++dy)
275 for (
int dx = 0; dx < D1D; ++dx)
277 Y(dx, dy, dz, e) = 0.0;
283 constexpr
int max_D1D =
MAX_D1D;
284 constexpr
int max_Q1D =
MAX_Q1D;
285 double sol_xyz[max_Q1D][max_Q1D][max_Q1D];
286 for (
int qz = 0; qz < Q1D; ++qz)
288 for (
int qy = 0; qy < Q1D; ++qy)
290 for (
int qx = 0; qx < Q1D; ++qx)
292 sol_xyz[qz][qy][qx] = 0.0;
296 for (
int dz = 0; dz < D1D; ++dz)
298 double sol_xy[max_Q1D][max_Q1D];
299 for (
int qy = 0; qy < Q1D; ++qy)
301 for (
int qx = 0; qx < Q1D; ++qx)
303 sol_xy[qy][qx] = 0.0;
306 for (
int dy = 0; dy < D1D; ++dy)
308 double sol_x[max_Q1D];
309 for (
int qx = 0; qx < Q1D; ++qx)
313 for (
int dx = 0; dx < D1D; ++dx)
315 const double s = X(dx,dy,dz,e);
316 for (
int qx = 0; qx < Q1D; ++qx)
318 sol_x[qx] += B(qx,dx) *
s;
321 for (
int qy = 0; qy < Q1D; ++qy)
323 const double wy = B(qy,dy);
324 for (
int qx = 0; qx < Q1D; ++qx)
326 sol_xy[qy][qx] += wy * sol_x[qx];
330 for (
int qz = 0; qz < Q1D; ++qz)
332 const double wz = B(qz,dz);
333 for (
int qy = 0; qy < Q1D; ++qy)
335 for (
int qx = 0; qx < Q1D; ++qx)
337 sol_xyz[qz][qy][qx] += wz * sol_xy[qy][qx];
342 for (
int qz = 0; qz < Q1D; ++qz)
344 for (
int qy = 0; qy < Q1D; ++qy)
346 for (
int qx = 0; qx < Q1D; ++qx)
348 sol_xyz[qz][qy][qx] *= D(qx,qy,qz,e);
352 for (
int qz = 0; qz < Q1D; ++qz)
354 double sol_xy[max_D1D][max_D1D];
355 for (
int dy = 0; dy < D1D; ++dy)
357 for (
int dx = 0; dx < D1D; ++dx)
362 for (
int qy = 0; qy < Q1D; ++qy)
364 double sol_x[max_D1D];
365 for (
int dx = 0; dx < D1D; ++dx)
369 for (
int qx = 0; qx < Q1D; ++qx)
371 const double s = sol_xyz[qz][qy][qx];
372 for (
int dx = 0; dx < D1D; ++dx)
374 sol_x[dx] += Bt(dx,qx) *
s;
377 for (
int dy = 0; dy < D1D; ++dy)
379 const double wy = Bt(dy,qy);
380 for (
int dx = 0; dx < D1D; ++dx)
382 sol_xy[dy][dx] += wy * sol_x[dx];
386 for (
int dz = 0; dz < D1D; ++dz)
388 const double wz = Bt(dz,qz);
389 for (
int dy = 0; dy < D1D; ++dy)
391 for (
int dx = 0; dx < D1D; ++dx)
393 Y(dx,dy,dz,e) += wz * sol_xy[dy][dx];
400 template<
int T_D1D,
int T_Q1D,
bool ACCUMULATE = true>
401 MFEM_HOST_DEVICE
inline
402 void SmemPAMassApply3D_Element(
const int e,
411 constexpr
int D1D = T_D1D ? T_D1D : d1d;
412 constexpr
int Q1D = T_Q1D ? T_Q1D : q1d;
413 constexpr
int MQ1 = T_Q1D ? T_Q1D :
MAX_Q1D;
414 constexpr
int MD1 = T_D1D ? T_D1D :
MAX_D1D;
415 constexpr
int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
418 auto d = DeviceTensor<4,const double>(d_, Q1D, Q1D, Q1D, NE);
419 auto x = DeviceTensor<4,const double>(x_, D1D, D1D, D1D, NE);
420 auto y = DeviceTensor<4,double>(y_, D1D, D1D, D1D, NE);
422 MFEM_SHARED
double sDQ[MQ1*MD1];
423 double (*B)[MD1] = (double (*)[MD1]) sDQ;
424 double (*Bt)[MQ1] = (double (*)[MQ1]) sDQ;
425 MFEM_SHARED
double sm0[MDQ*MDQ*MDQ];
426 MFEM_SHARED
double sm1[MDQ*MDQ*MDQ];
427 double (*X)[MD1][MD1] = (double (*)[MD1][MD1]) sm0;
428 double (*DDQ)[MD1][MQ1] = (double (*)[MD1][MQ1]) sm1;
429 double (*DQQ)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) sm0;
430 double (*QQQ)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) sm1;
431 double (*QQD)[MQ1][MD1] = (double (*)[MQ1][MD1]) sm0;
432 double (*QDD)[MD1][MD1] = (double (*)[MD1][MD1]) sm1;
433 MFEM_FOREACH_THREAD(dy,y,D1D)
435 MFEM_FOREACH_THREAD(dx,x,D1D)
438 for (
int dz = 0; dz < D1D; ++dz)
440 X[dz][dy][dx] = x(dx,dy,dz,e);
443 MFEM_FOREACH_THREAD(dx,x,Q1D)
445 B[dx][dy] =
b(dx,dy);
449 MFEM_FOREACH_THREAD(dy,y,D1D)
451 MFEM_FOREACH_THREAD(qx,x,Q1D)
455 for (
int dz = 0; dz < D1D; dz++)
460 for (
int dx = 0; dx < D1D; ++dx)
463 for (
int dz = 0; dz < D1D; ++dz)
465 u[dz] += X[dz][dy][dx] * B[qx][dx];
469 for (
int dz = 0; dz < D1D; ++dz)
471 DDQ[dz][dy][qx] = u[dz];
476 MFEM_FOREACH_THREAD(qy,y,Q1D)
478 MFEM_FOREACH_THREAD(qx,x,Q1D)
482 for (
int dz = 0; dz < D1D; dz++)
487 for (
int dy = 0; dy < D1D; ++dy)
490 for (
int dz = 0; dz < D1D; dz++)
492 u[dz] += DDQ[dz][dy][qx] * B[qy][dy];
496 for (
int dz = 0; dz < D1D; dz++)
498 DQQ[dz][qy][qx] = u[dz];
503 MFEM_FOREACH_THREAD(qy,y,Q1D)
505 MFEM_FOREACH_THREAD(qx,x,Q1D)
509 for (
int qz = 0; qz < Q1D; qz++)
514 for (
int dz = 0; dz < D1D; ++dz)
517 for (
int qz = 0; qz < Q1D; qz++)
519 u[qz] += DQQ[dz][qy][qx] * B[qz][dz];
523 for (
int qz = 0; qz < Q1D; qz++)
525 QQQ[qz][qy][qx] = u[qz] * d(qx,qy,qz,e);
530 MFEM_FOREACH_THREAD(di,y,D1D)
532 MFEM_FOREACH_THREAD(q,x,Q1D)
538 MFEM_FOREACH_THREAD(qy,y,Q1D)
540 MFEM_FOREACH_THREAD(dx,x,D1D)
544 for (
int qz = 0; qz < Q1D; ++qz)
549 for (
int qx = 0; qx < Q1D; ++qx)
552 for (
int qz = 0; qz < Q1D; ++qz)
554 u[qz] += QQQ[qz][qy][qx] * Bt[dx][qx];
558 for (
int qz = 0; qz < Q1D; ++qz)
560 QQD[qz][qy][dx] = u[qz];
565 MFEM_FOREACH_THREAD(dy,y,D1D)
567 MFEM_FOREACH_THREAD(dx,x,D1D)
571 for (
int qz = 0; qz < Q1D; ++qz)
576 for (
int qy = 0; qy < Q1D; ++qy)
579 for (
int qz = 0; qz < Q1D; ++qz)
581 u[qz] += QQD[qz][qy][dx] * Bt[dy][qy];
585 for (
int qz = 0; qz < Q1D; ++qz)
587 QDD[qz][dy][dx] = u[qz];
592 MFEM_FOREACH_THREAD(dy,y,D1D)
594 MFEM_FOREACH_THREAD(dx,x,D1D)
598 for (
int dz = 0; dz < D1D; ++dz)
603 for (
int qz = 0; qz < Q1D; ++qz)
606 for (
int dz = 0; dz < D1D; ++dz)
608 u[dz] += QDD[qz][dy][dx] * Bt[dz][qz];
612 for (
int dz = 0; dz < D1D; ++dz)
616 y(dx,dy,dz,e) += u[dz];
620 y(dx,dy,dz,e) = u[dz];
DeviceTensor< 2, const double > ConstDeviceMatrix
DeviceTensor< 3, double > DeviceCube
double u(const Vector &xvec)
DeviceTensor< 3, const double > ConstDeviceCube