12#ifndef MFEM_BILININTEG_HCURL_KERNELS_HPP
13#define MFEM_BILININTEG_HCURL_KERNELS_HPP
32void PAHcurlMassAssembleDiagonal2D(
const int D1D,
36 const Array<real_t> &bo,
37 const Array<real_t> &bc,
38 const Vector &pa_data,
42void PAHcurlMassAssembleDiagonal3D(
const int D1D,
46 const Array<real_t> &bo,
47 const Array<real_t> &bc,
48 const Vector &pa_data,
52template<
int T_D1D = 0,
int T_Q1D = 0>
53inline void SmemPAHcurlMassAssembleDiagonal3D(
const int d1d,
57 const Array<real_t> &bo,
58 const Array<real_t> &bc,
59 const Vector &pa_data,
63 "Error: d1d > HCURL_MAX_D1D");
65 "Error: q1d > HCURL_MAX_Q1D");
66 const int D1D = T_D1D ? T_D1D : d1d;
67 const int Q1D = T_Q1D ? T_Q1D : q1d;
69 auto Bo =
Reshape(bo.Read(), Q1D, D1D-1);
70 auto Bc =
Reshape(bc.Read(), Q1D, D1D);
71 auto op =
Reshape(pa_data.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
72 auto D =
Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
76 constexpr int VDIM = 3;
77 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
78 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
79 const int D1D = T_D1D ? T_D1D : d1d;
80 const int Q1D = T_Q1D ? T_Q1D : q1d;
82 MFEM_SHARED
real_t sBo[MQ1D][MD1D];
83 MFEM_SHARED
real_t sBc[MQ1D][MD1D];
86 MFEM_SHARED
real_t sop[3][MQ1D][MQ1D];
88 MFEM_FOREACH_THREAD(qx,x,Q1D)
90 MFEM_FOREACH_THREAD(qy,y,Q1D)
92 MFEM_FOREACH_THREAD(qz,z,Q1D)
94 op3[0] = op(qx,qy,qz,0,e);
95 op3[1] = op(qx,qy,qz,symmetric ? 3 : 4,e);
96 op3[2] = op(qx,qy,qz,symmetric ? 5 : 8,e);
101 const int tidx = MFEM_THREAD_ID(x);
102 const int tidy = MFEM_THREAD_ID(y);
103 const int tidz = MFEM_THREAD_ID(z);
107 MFEM_FOREACH_THREAD(d,y,D1D)
109 MFEM_FOREACH_THREAD(q,x,Q1D)
122 for (
int c = 0; c < VDIM; ++c)
124 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
125 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
126 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
130 for (
int qz=0; qz < Q1D; ++qz)
134 for (
int i=0; i<3; ++i)
136 sop[i][tidx][tidy] = op3[i];
142 MFEM_FOREACH_THREAD(dz,z,D1Dz)
144 const real_t wz = ((c == 2) ? sBo[qz][dz] : sBc[qz][dz]);
146 MFEM_FOREACH_THREAD(dy,y,D1Dy)
148 MFEM_FOREACH_THREAD(dx,x,D1Dx)
150 for (
int qy = 0; qy < Q1D; ++qy)
152 const real_t wy = ((c == 1) ? sBo[qy][dy] : sBc[qy][dy]);
154 for (
int qx = 0; qx < Q1D; ++qx)
156 const real_t wx = ((c == 0) ? sBo[qx][dx] : sBc[qx][dx]);
157 dxyz += sop[c][qx][qy] * wx * wx * wy * wy * wz * wz;
167 MFEM_FOREACH_THREAD(dz,z,D1Dz)
169 MFEM_FOREACH_THREAD(dy,y,D1Dy)
171 MFEM_FOREACH_THREAD(dx,x,D1Dx)
173 D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += dxyz;
178 osc += D1Dx * D1Dy * D1Dz;
184void PAHcurlMassApply2D(
const int D1D,
187 const bool symmetric,
188 const Array<real_t> &bo,
189 const Array<real_t> &bc,
190 const Array<real_t> &bot,
191 const Array<real_t> &bct,
192 const Vector &pa_data,
197void PAHcurlMassApply3D(
const int D1D,
200 const bool symmetric,
201 const Array<real_t> &bo,
202 const Array<real_t> &bc,
203 const Array<real_t> &bot,
204 const Array<real_t> &bct,
205 const Vector &pa_data,
210template<
int T_D1D = 0,
int T_Q1D = 0>
211inline void SmemPAHcurlMassApply3D(
const int d1d,
214 const bool symmetric,
215 const Array<real_t> &bo,
216 const Array<real_t> &bc,
217 const Array<real_t> &bot,
218 const Array<real_t> &bct,
219 const Vector &pa_data,
224 "Error: d1d > HCURL_MAX_D1D");
226 "Error: q1d > HCURL_MAX_Q1D");
227 const int D1D = T_D1D ? T_D1D : d1d;
228 const int Q1D = T_Q1D ? T_Q1D : q1d;
230 const int dataSize = symmetric ? 6 : 9;
232 auto Bo =
Reshape(bo.Read(), Q1D, D1D-1);
233 auto Bc =
Reshape(bc.Read(), Q1D, D1D);
234 auto op =
Reshape(pa_data.Read(), Q1D, Q1D, Q1D, dataSize, NE);
235 auto X =
Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
236 auto Y =
Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
240 constexpr int VDIM = 3;
241 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
242 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
243 const int D1D = T_D1D ? T_D1D : d1d;
244 const int Q1D = T_Q1D ? T_Q1D : q1d;
246 MFEM_SHARED
real_t sBo[MQ1D][MD1D];
247 MFEM_SHARED
real_t sBc[MQ1D][MD1D];
250 MFEM_SHARED
real_t sop[9*MQ1D*MQ1D];
251 MFEM_SHARED
real_t mass[MQ1D][MQ1D][3];
253 MFEM_SHARED
real_t sX[MD1D][MD1D][MD1D];
255 MFEM_FOREACH_THREAD(qx,x,Q1D)
257 MFEM_FOREACH_THREAD(qy,y,Q1D)
259 MFEM_FOREACH_THREAD(qz,z,Q1D)
261 for (
int i=0; i<dataSize; ++i)
263 op9[i] = op(qx,qy,qz,i,e);
269 const int tidx = MFEM_THREAD_ID(x);
270 const int tidy = MFEM_THREAD_ID(y);
271 const int tidz = MFEM_THREAD_ID(z);
275 MFEM_FOREACH_THREAD(d,y,D1D)
277 MFEM_FOREACH_THREAD(q,x,Q1D)
289 for (
int qz=0; qz < Q1D; ++qz)
292 for (
int c = 0; c < VDIM; ++c)
294 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
295 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
296 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
298 MFEM_FOREACH_THREAD(dz,z,D1Dz)
300 MFEM_FOREACH_THREAD(dy,y,D1Dy)
302 MFEM_FOREACH_THREAD(dx,x,D1Dx)
304 sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
312 for (
int i=0; i<dataSize; ++i)
314 sop[i + (dataSize*tidx) + (dataSize*Q1D*tidy)] = op9[i];
317 MFEM_FOREACH_THREAD(qy,y,Q1D)
319 MFEM_FOREACH_THREAD(qx,x,Q1D)
323 for (
int dz = 0; dz < D1Dz; ++dz)
325 const real_t wz = (c == 2) ? sBo[qz][dz] : sBc[qz][dz];
326 for (
int dy = 0; dy < D1Dy; ++dy)
328 const real_t wy = (c == 1) ? sBo[qy][dy] : sBc[qy][dy];
329 for (
int dx = 0; dx < D1Dx; ++dx)
331 const real_t t = sX[dz][dy][dx];
332 const real_t wx = (c == 0) ? sBo[qx][dx] : sBc[qx][dx];
333 u += t * wx * wy * wz;
343 osc += D1Dx * D1Dy * D1Dz;
350 for (
int c = 0; c < VDIM; ++c)
352 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
353 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
354 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
358 MFEM_FOREACH_THREAD(dz,z,D1Dz)
360 const real_t wz = (c == 2) ? sBo[qz][dz] : sBc[qz][dz];
362 MFEM_FOREACH_THREAD(dy,y,D1Dy)
364 MFEM_FOREACH_THREAD(dx,x,D1Dx)
366 for (
int qy = 0; qy < Q1D; ++qy)
368 const real_t wy = (c == 1) ? sBo[qy][dy] : sBc[qy][dy];
369 for (
int qx = 0; qx < Q1D; ++qx)
371 const int os = (dataSize*qx) + (dataSize*Q1D*qy);
372 const int id1 = os + ((c == 0) ? 0 : ((c == 1) ? (symmetric ? 1 : 3) :
373 (symmetric ? 2 : 6)));
374 const int id2 = os + ((c == 0) ? 1 : ((c == 1) ? (symmetric ? 3 : 4) :
375 (symmetric ? 4 : 7)));
376 const int id3 = os + ((c == 0) ? 2 : ((c == 1) ? (symmetric ? 4 : 5) :
377 (symmetric ? 5 : 8)));
379 const real_t m_c = (sop[id1] * mass[qy][qx][0]) + (sop[id2] * mass[qy][qx][1]) +
380 (sop[id3] * mass[qy][qx][2]);
382 const real_t wx = (c == 0) ? sBo[qx][dx] : sBc[qx][dx];
383 dxyz += m_c * wx * wy * wz;
392 MFEM_FOREACH_THREAD(dz,z,D1Dz)
394 MFEM_FOREACH_THREAD(dy,y,D1Dy)
396 MFEM_FOREACH_THREAD(dx,x,D1Dx)
398 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += dxyz;
403 osc += D1Dx * D1Dy * D1Dz;
410void PACurlCurlSetup2D(
const int Q1D,
412 const Array<real_t> &w,
418void PACurlCurlSetup3D(
const int Q1D,
421 const Array<real_t> &w,
427void PACurlCurlAssembleDiagonal2D(
const int D1D,
430 const Array<real_t> &bo,
431 const Array<real_t> &gc,
432 const Vector &pa_data,
436template<
int T_D1D = 0,
int T_Q1D = 0>
437inline void PACurlCurlAssembleDiagonal3D(
const int d1d,
439 const bool symmetric,
441 const Array<real_t> &bo,
442 const Array<real_t> &bc,
443 const Array<real_t> &go,
444 const Array<real_t> &gc,
445 const Vector &pa_data,
449 "Error: d1d > HCURL_MAX_D1D");
451 "Error: q1d > HCURL_MAX_Q1D");
452 const int D1D = T_D1D ? T_D1D : d1d;
453 const int Q1D = T_Q1D ? T_Q1D : q1d;
455 auto Bo =
Reshape(bo.Read(), Q1D, D1D-1);
456 auto Bc =
Reshape(bc.Read(), Q1D, D1D);
457 auto Go =
Reshape(go.Read(), Q1D, D1D-1);
458 auto Gc =
Reshape(gc.Read(), Q1D, D1D);
459 auto op =
Reshape(pa_data.Read(), Q1D, Q1D, Q1D, (symmetric ? 6 : 9), NE);
460 auto D =
Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
462 const int s = symmetric ? 6 : 9;
466 const int i21 = symmetric ? i12 : 3;
467 const int i22 = symmetric ? 3 : 4;
468 const int i23 = symmetric ? 4 : 5;
469 const int i31 = symmetric ? i13 : 6;
470 const int i32 = symmetric ? i23 : 7;
471 const int i33 = symmetric ? 5 : 8;
484 constexpr int VDIM = 3;
485 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
486 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
487 const int D1D = T_D1D ? T_D1D : d1d;
488 const int Q1D = T_Q1D ? T_Q1D : q1d;
492 for (
int c = 0; c < VDIM; ++c)
494 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
495 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
496 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
498 real_t zt[MQ1D][MQ1D][MD1D][9][3];
501 for (
int qx = 0; qx < Q1D; ++qx)
503 for (
int qy = 0; qy < Q1D; ++qy)
505 for (
int dz = 0; dz < D1Dz; ++dz)
507 for (
int i=0; i<s; ++i)
509 for (
int d=0; d<3; ++d)
511 zt[qx][qy][dz][i][d] = 0.0;
515 for (
int qz = 0; qz < Q1D; ++qz)
517 const real_t wz = ((c == 2) ? Bo(qz,dz) : Bc(qz,dz));
518 const real_t wDz = ((c == 2) ? Go(qz,dz) : Gc(qz,dz));
520 for (
int i=0; i<s; ++i)
522 zt[qx][qy][dz][i][0] += wz * wz * op(qx,qy,qz,i,e);
523 zt[qx][qy][dz][i][1] += wDz * wz * op(qx,qy,qz,i,e);
524 zt[qx][qy][dz][i][2] += wDz * wDz * op(qx,qy,qz,i,e);
531 real_t yt[MQ1D][MD1D][MD1D][9][3][3];
534 for (
int qx = 0; qx < Q1D; ++qx)
536 for (
int dz = 0; dz < D1Dz; ++dz)
538 for (
int dy = 0; dy < D1Dy; ++dy)
540 for (
int i=0; i<s; ++i)
542 for (
int d=0; d<3; ++d)
543 for (
int j=0; j<3; ++j)
545 yt[qx][dy][dz][i][d][j] = 0.0;
549 for (
int qy = 0; qy < Q1D; ++qy)
551 const real_t wy = ((c == 1) ? Bo(qy,dy) : Bc(qy,dy));
552 const real_t wDy = ((c == 1) ? Go(qy,dy) : Gc(qy,dy));
554 for (
int i=0; i<s; ++i)
556 for (
int d=0; d<3; ++d)
558 yt[qx][dy][dz][i][d][0] += wy * wy * zt[qx][qy][dz][i][d];
559 yt[qx][dy][dz][i][d][1] += wDy * wy * zt[qx][qy][dz][i][d];
560 yt[qx][dy][dz][i][d][2] += wDy * wDy * zt[qx][qy][dz][i][d];
569 for (
int dz = 0; dz < D1Dz; ++dz)
571 for (
int dy = 0; dy < D1Dy; ++dy)
573 for (
int dx = 0; dx < D1Dx; ++dx)
575 for (
int qx = 0; qx < Q1D; ++qx)
577 const real_t wx = ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
578 const real_t wDx = ((c == 0) ? Go(qx,dx) : Gc(qx,dx));
598 const real_t sumy = yt[qx][dy][dz][i22][2][0] - yt[qx][dy][dz][i23][1][1]
599 - yt[qx][dy][dz][i32][1][1] + yt[qx][dy][dz][i33][0][2];
601 D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += sumy * wx * wx;
606 const real_t d = (yt[qx][dy][dz][i11][2][0] * wx * wx)
607 - ((yt[qx][dy][dz][i13][1][0] + yt[qx][dy][dz][i31][1][0]) * wDx * wx)
608 + (yt[qx][dy][dz][i33][0][0] * wDx * wDx);
610 D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += d;
615 const real_t d = (yt[qx][dy][dz][i11][0][2] * wx * wx)
616 - ((yt[qx][dy][dz][i12][0][1] + yt[qx][dy][dz][i21][0][1]) * wDx * wx)
617 + (yt[qx][dy][dz][i22][0][0] * wDx * wDx);
619 D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += d;
626 osc += D1Dx * D1Dy * D1Dz;
632template<
int T_D1D = 0,
int T_Q1D = 0>
633inline void SmemPACurlCurlAssembleDiagonal3D(
const int d1d,
635 const bool symmetric,
637 const Array<real_t> &bo,
638 const Array<real_t> &bc,
639 const Array<real_t> &go,
640 const Array<real_t> &gc,
641 const Vector &pa_data,
645 "Error: d1d > HCURL_MAX_D1D");
647 "Error: q1d > HCURL_MAX_Q1D");
648 const int D1D = T_D1D ? T_D1D : d1d;
649 const int Q1D = T_Q1D ? T_Q1D : q1d;
651 auto Bo =
Reshape(bo.Read(), Q1D, D1D-1);
652 auto Bc =
Reshape(bc.Read(), Q1D, D1D);
653 auto Go =
Reshape(go.Read(), Q1D, D1D-1);
654 auto Gc =
Reshape(gc.Read(), Q1D, D1D);
655 auto op =
Reshape(pa_data.Read(), Q1D, Q1D, Q1D, (symmetric ? 6 : 9), NE);
656 auto D =
Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
658 const int s = symmetric ? 6 : 9;
662 const int i21 = symmetric ? i12 : 3;
663 const int i22 = symmetric ? 3 : 4;
664 const int i23 = symmetric ? 4 : 5;
665 const int i31 = symmetric ? i13 : 6;
666 const int i32 = symmetric ? i23 : 7;
667 const int i33 = symmetric ? 5 : 8;
677 constexpr int VDIM = 3;
678 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
679 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
680 const int D1D = T_D1D ? T_D1D : d1d;
681 const int Q1D = T_Q1D ? T_Q1D : q1d;
683 MFEM_SHARED
real_t sBo[MQ1D][MD1D];
684 MFEM_SHARED
real_t sBc[MQ1D][MD1D];
685 MFEM_SHARED
real_t sGo[MQ1D][MD1D];
686 MFEM_SHARED
real_t sGc[MQ1D][MD1D];
689 MFEM_SHARED
real_t sop[9][MQ1D][MQ1D];
691 MFEM_FOREACH_THREAD(qx,x,Q1D)
693 MFEM_FOREACH_THREAD(qy,y,Q1D)
695 MFEM_FOREACH_THREAD(qz,z,Q1D)
697 for (
int i=0; i<s; ++i)
699 ope[i] = op(qx,qy,qz,i,e);
705 const int tidx = MFEM_THREAD_ID(x);
706 const int tidy = MFEM_THREAD_ID(y);
707 const int tidz = MFEM_THREAD_ID(z);
711 MFEM_FOREACH_THREAD(d,y,D1D)
713 MFEM_FOREACH_THREAD(q,x,Q1D)
728 for (
int c = 0; c < VDIM; ++c)
730 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
731 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
732 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
736 for (
int qz=0; qz < Q1D; ++qz)
740 for (
int i=0; i<s; ++i)
742 sop[i][tidx][tidy] = ope[i];
748 MFEM_FOREACH_THREAD(dz,z,D1Dz)
750 const real_t wz = ((c == 2) ? sBo[qz][dz] : sBc[qz][dz]);
751 const real_t wDz = ((c == 2) ? sGo[qz][dz] : sGc[qz][dz]);
753 MFEM_FOREACH_THREAD(dy,y,D1Dy)
755 MFEM_FOREACH_THREAD(dx,x,D1Dx)
757 for (
int qy = 0; qy < Q1D; ++qy)
759 const real_t wy = ((c == 1) ? sBo[qy][dy] : sBc[qy][dy]);
760 const real_t wDy = ((c == 1) ? sGo[qy][dy] : sGc[qy][dy]);
762 for (
int qx = 0; qx < Q1D; ++qx)
764 const real_t wx = ((c == 0) ? sBo[qx][dx] : sBc[qx][dx]);
765 const real_t wDx = ((c == 0) ? sGo[qx][dx] : sGc[qx][dx]);
772 dxyz += sop[i22][qx][qy] * wx * wx * wy * wy * wDz * wDz;
775 dxyz += -(sop[i23][qx][qy] + sop[i32][qx][qy]) * wx * wx * wDy * wy * wDz * wz;
778 dxyz += sop[i33][qx][qy] * wx * wx * wDy * wDy * wz * wz;
785 dxyz += sop[i11][qx][qy] * wx * wx * wy * wy * wDz * wDz;
788 dxyz += -(sop[i13][qx][qy] + sop[i31][qx][qy]) * wDx * wx * wy * wy * wDz * wz;
791 dxyz += sop[i33][qx][qy] * wDx * wDx * wy * wy * wz * wz;
798 dxyz += sop[i11][qx][qy] * wx * wx * wDy * wDy * wz * wz;
801 dxyz += -(sop[i12][qx][qy] + sop[i21][qx][qy]) * wDx * wx * wDy * wy * wz * wz;
804 dxyz += sop[i22][qx][qy] * wDx * wDx * wy * wy * wz * wz;
815 MFEM_FOREACH_THREAD(dz,z,D1Dz)
817 MFEM_FOREACH_THREAD(dy,y,D1Dy)
819 MFEM_FOREACH_THREAD(dx,x,D1Dx)
821 D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += dxyz;
826 osc += D1Dx * D1Dy * D1Dz;
832void PACurlCurlApply2D(
const int D1D,
835 const Array<real_t> &bo,
836 const Array<real_t> &bot,
837 const Array<real_t> &gc,
838 const Array<real_t> &gct,
839 const Vector &pa_data,
844template<
int T_D1D = 0,
int T_Q1D = 0>
845inline void PACurlCurlApply3D(
const int d1d,
847 const bool symmetric,
849 const Array<real_t> &bo,
850 const Array<real_t> &bc,
851 const Array<real_t> &bot,
852 const Array<real_t> &bct,
853 const Array<real_t> &gc,
854 const Array<real_t> &gct,
855 const Vector &pa_data,
860 "Error: d1d > HCURL_MAX_D1D");
862 "Error: q1d > HCURL_MAX_Q1D");
863 const int D1D = T_D1D ? T_D1D : d1d;
864 const int Q1D = T_Q1D ? T_Q1D : q1d;
866 auto Bo =
Reshape(bo.Read(), Q1D, D1D-1);
867 auto Bc =
Reshape(bc.Read(), Q1D, D1D);
868 auto Bot =
Reshape(bot.Read(), D1D-1, Q1D);
869 auto Bct =
Reshape(bct.Read(), D1D, Q1D);
870 auto Gc =
Reshape(gc.Read(), Q1D, D1D);
871 auto Gct =
Reshape(gct.Read(), D1D, Q1D);
872 auto op =
Reshape(pa_data.Read(), Q1D, Q1D, Q1D, (symmetric ? 6 : 9), NE);
873 auto X =
Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
874 auto Y =
Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
886 constexpr int VDIM = 3;
887 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
888 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
889 const int D1D = T_D1D ? T_D1D : d1d;
890 const int Q1D = T_Q1D ? T_Q1D : q1d;
892 real_t curl[MQ1D][MQ1D][MQ1D][VDIM];
895 for (
int qz = 0; qz < Q1D; ++qz)
897 for (
int qy = 0; qy < Q1D; ++qy)
899 for (
int qx = 0; qx < Q1D; ++qx)
901 for (
int c = 0; c < VDIM; ++c)
903 curl[qz][qy][qx][c] = 0.0;
915 const int D1Dz = D1D;
916 const int D1Dy = D1D;
917 const int D1Dx = D1D - 1;
919 for (
int dz = 0; dz < D1Dz; ++dz)
921 real_t gradXY[MQ1D][MQ1D][2];
922 for (
int qy = 0; qy < Q1D; ++qy)
924 for (
int qx = 0; qx < Q1D; ++qx)
926 for (
int d = 0; d < 2; ++d)
928 gradXY[qy][qx][d] = 0.0;
933 for (
int dy = 0; dy < D1Dy; ++dy)
936 for (
int qx = 0; qx < Q1D; ++qx)
941 for (
int dx = 0; dx < D1Dx; ++dx)
943 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
944 for (
int qx = 0; qx < Q1D; ++qx)
946 massX[qx] += t * Bo(qx,dx);
950 for (
int qy = 0; qy < Q1D; ++qy)
952 const real_t wy = Bc(qy,dy);
953 const real_t wDy = Gc(qy,dy);
954 for (
int qx = 0; qx < Q1D; ++qx)
956 const real_t wx = massX[qx];
957 gradXY[qy][qx][0] += wx * wDy;
958 gradXY[qy][qx][1] += wx * wy;
963 for (
int qz = 0; qz < Q1D; ++qz)
965 const real_t wz = Bc(qz,dz);
966 const real_t wDz = Gc(qz,dz);
967 for (
int qy = 0; qy < Q1D; ++qy)
969 for (
int qx = 0; qx < Q1D; ++qx)
972 curl[qz][qy][qx][1] += gradXY[qy][qx][1] * wDz;
973 curl[qz][qy][qx][2] -= gradXY[qy][qx][0] * wz;
979 osc += D1Dx * D1Dy * D1Dz;
984 const int D1Dz = D1D;
985 const int D1Dy = D1D - 1;
986 const int D1Dx = D1D;
988 for (
int dz = 0; dz < D1Dz; ++dz)
990 real_t gradXY[MQ1D][MQ1D][2];
991 for (
int qy = 0; qy < Q1D; ++qy)
993 for (
int qx = 0; qx < Q1D; ++qx)
995 for (
int d = 0; d < 2; ++d)
997 gradXY[qy][qx][d] = 0.0;
1002 for (
int dx = 0; dx < D1Dx; ++dx)
1005 for (
int qy = 0; qy < Q1D; ++qy)
1010 for (
int dy = 0; dy < D1Dy; ++dy)
1012 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
1013 for (
int qy = 0; qy < Q1D; ++qy)
1015 massY[qy] += t * Bo(qy,dy);
1019 for (
int qx = 0; qx < Q1D; ++qx)
1021 const real_t wx = Bc(qx,dx);
1022 const real_t wDx = Gc(qx,dx);
1023 for (
int qy = 0; qy < Q1D; ++qy)
1025 const real_t wy = massY[qy];
1026 gradXY[qy][qx][0] += wDx * wy;
1027 gradXY[qy][qx][1] += wx * wy;
1032 for (
int qz = 0; qz < Q1D; ++qz)
1034 const real_t wz = Bc(qz,dz);
1035 const real_t wDz = Gc(qz,dz);
1036 for (
int qy = 0; qy < Q1D; ++qy)
1038 for (
int qx = 0; qx < Q1D; ++qx)
1041 curl[qz][qy][qx][0] -= gradXY[qy][qx][1] * wDz;
1042 curl[qz][qy][qx][2] += gradXY[qy][qx][0] * wz;
1048 osc += D1Dx * D1Dy * D1Dz;
1053 const int D1Dz = D1D - 1;
1054 const int D1Dy = D1D;
1055 const int D1Dx = D1D;
1057 for (
int dx = 0; dx < D1Dx; ++dx)
1059 real_t gradYZ[MQ1D][MQ1D][2];
1060 for (
int qz = 0; qz < Q1D; ++qz)
1062 for (
int qy = 0; qy < Q1D; ++qy)
1064 for (
int d = 0; d < 2; ++d)
1066 gradYZ[qz][qy][d] = 0.0;
1071 for (
int dy = 0; dy < D1Dy; ++dy)
1074 for (
int qz = 0; qz < Q1D; ++qz)
1079 for (
int dz = 0; dz < D1Dz; ++dz)
1081 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
1082 for (
int qz = 0; qz < Q1D; ++qz)
1084 massZ[qz] += t * Bo(qz,dz);
1088 for (
int qy = 0; qy < Q1D; ++qy)
1090 const real_t wy = Bc(qy,dy);
1091 const real_t wDy = Gc(qy,dy);
1092 for (
int qz = 0; qz < Q1D; ++qz)
1094 const real_t wz = massZ[qz];
1095 gradYZ[qz][qy][0] += wz * wy;
1096 gradYZ[qz][qy][1] += wz * wDy;
1101 for (
int qx = 0; qx < Q1D; ++qx)
1103 const real_t wx = Bc(qx,dx);
1104 const real_t wDx = Gc(qx,dx);
1106 for (
int qy = 0; qy < Q1D; ++qy)
1108 for (
int qz = 0; qz < Q1D; ++qz)
1111 curl[qz][qy][qx][0] += gradYZ[qz][qy][1] * wx;
1112 curl[qz][qy][qx][1] -= gradYZ[qz][qy][0] * wDx;
1120 for (
int qz = 0; qz < Q1D; ++qz)
1122 for (
int qy = 0; qy < Q1D; ++qy)
1124 for (
int qx = 0; qx < Q1D; ++qx)
1126 const real_t O11 = op(qx,qy,qz,0,e);
1127 const real_t O12 = op(qx,qy,qz,1,e);
1128 const real_t O13 = op(qx,qy,qz,2,e);
1129 const real_t O21 = symmetric ? O12 : op(qx,qy,qz,3,e);
1130 const real_t O22 = symmetric ? op(qx,qy,qz,3,e) : op(qx,qy,qz,4,e);
1131 const real_t O23 = symmetric ? op(qx,qy,qz,4,e) : op(qx,qy,qz,5,e);
1132 const real_t O31 = symmetric ? O13 : op(qx,qy,qz,6,e);
1133 const real_t O32 = symmetric ? O23 : op(qx,qy,qz,7,e);
1134 const real_t O33 = symmetric ? op(qx,qy,qz,5,e) : op(qx,qy,qz,8,e);
1136 const real_t c1 = (O11 * curl[qz][qy][qx][0]) + (O12 * curl[qz][qy][qx][1]) +
1137 (O13 * curl[qz][qy][qx][2]);
1138 const real_t c2 = (O21 * curl[qz][qy][qx][0]) + (O22 * curl[qz][qy][qx][1]) +
1139 (O23 * curl[qz][qy][qx][2]);
1140 const real_t c3 = (O31 * curl[qz][qy][qx][0]) + (O32 * curl[qz][qy][qx][1]) +
1141 (O33 * curl[qz][qy][qx][2]);
1143 curl[qz][qy][qx][0] = c1;
1144 curl[qz][qy][qx][1] = c2;
1145 curl[qz][qy][qx][2] = c3;
1153 const int D1Dz = D1D;
1154 const int D1Dy = D1D;
1155 const int D1Dx = D1D - 1;
1157 for (
int qz = 0; qz < Q1D; ++qz)
1159 real_t gradXY12[MD1D][MD1D];
1160 real_t gradXY21[MD1D][MD1D];
1162 for (
int dy = 0; dy < D1Dy; ++dy)
1164 for (
int dx = 0; dx < D1Dx; ++dx)
1166 gradXY12[dy][dx] = 0.0;
1167 gradXY21[dy][dx] = 0.0;
1170 for (
int qy = 0; qy < Q1D; ++qy)
1173 for (
int dx = 0; dx < D1Dx; ++dx)
1175 for (
int n = 0; n < 2; ++n)
1180 for (
int qx = 0; qx < Q1D; ++qx)
1182 for (
int dx = 0; dx < D1Dx; ++dx)
1184 const real_t wx = Bot(dx,qx);
1186 massX[dx][0] += wx * curl[qz][qy][qx][1];
1187 massX[dx][1] += wx * curl[qz][qy][qx][2];
1190 for (
int dy = 0; dy < D1Dy; ++dy)
1192 const real_t wy = Bct(dy,qy);
1193 const real_t wDy = Gct(dy,qy);
1195 for (
int dx = 0; dx < D1Dx; ++dx)
1197 gradXY21[dy][dx] += massX[dx][0] * wy;
1198 gradXY12[dy][dx] += massX[dx][1] * wDy;
1203 for (
int dz = 0; dz < D1Dz; ++dz)
1205 const real_t wz = Bct(dz,qz);
1206 const real_t wDz = Gct(dz,qz);
1207 for (
int dy = 0; dy < D1Dy; ++dy)
1209 for (
int dx = 0; dx < D1Dx; ++dx)
1213 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
1214 e) += (gradXY21[dy][dx] * wDz) - (gradXY12[dy][dx] * wz);
1220 osc += D1Dx * D1Dy * D1Dz;
1225 const int D1Dz = D1D;
1226 const int D1Dy = D1D - 1;
1227 const int D1Dx = D1D;
1229 for (
int qz = 0; qz < Q1D; ++qz)
1231 real_t gradXY02[MD1D][MD1D];
1232 real_t gradXY20[MD1D][MD1D];
1234 for (
int dy = 0; dy < D1Dy; ++dy)
1236 for (
int dx = 0; dx < D1Dx; ++dx)
1238 gradXY02[dy][dx] = 0.0;
1239 gradXY20[dy][dx] = 0.0;
1242 for (
int qx = 0; qx < Q1D; ++qx)
1245 for (
int dy = 0; dy < D1Dy; ++dy)
1250 for (
int qy = 0; qy < Q1D; ++qy)
1252 for (
int dy = 0; dy < D1Dy; ++dy)
1254 const real_t wy = Bot(dy,qy);
1256 massY[dy][0] += wy * curl[qz][qy][qx][2];
1257 massY[dy][1] += wy * curl[qz][qy][qx][0];
1260 for (
int dx = 0; dx < D1Dx; ++dx)
1262 const real_t wx = Bct(dx,qx);
1263 const real_t wDx = Gct(dx,qx);
1265 for (
int dy = 0; dy < D1Dy; ++dy)
1267 gradXY02[dy][dx] += massY[dy][0] * wDx;
1268 gradXY20[dy][dx] += massY[dy][1] * wx;
1273 for (
int dz = 0; dz < D1Dz; ++dz)
1275 const real_t wz = Bct(dz,qz);
1276 const real_t wDz = Gct(dz,qz);
1277 for (
int dy = 0; dy < D1Dy; ++dy)
1279 for (
int dx = 0; dx < D1Dx; ++dx)
1283 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
1284 e) += (-gradXY20[dy][dx] * wDz) + (gradXY02[dy][dx] * wz);
1290 osc += D1Dx * D1Dy * D1Dz;
1295 const int D1Dz = D1D - 1;
1296 const int D1Dy = D1D;
1297 const int D1Dx = D1D;
1299 for (
int qx = 0; qx < Q1D; ++qx)
1301 real_t gradYZ01[MD1D][MD1D];
1302 real_t gradYZ10[MD1D][MD1D];
1304 for (
int dy = 0; dy < D1Dy; ++dy)
1306 for (
int dz = 0; dz < D1Dz; ++dz)
1308 gradYZ01[dz][dy] = 0.0;
1309 gradYZ10[dz][dy] = 0.0;
1312 for (
int qy = 0; qy < Q1D; ++qy)
1315 for (
int dz = 0; dz < D1Dz; ++dz)
1317 for (
int n = 0; n < 2; ++n)
1322 for (
int qz = 0; qz < Q1D; ++qz)
1324 for (
int dz = 0; dz < D1Dz; ++dz)
1326 const real_t wz = Bot(dz,qz);
1328 massZ[dz][0] += wz * curl[qz][qy][qx][0];
1329 massZ[dz][1] += wz * curl[qz][qy][qx][1];
1332 for (
int dy = 0; dy < D1Dy; ++dy)
1334 const real_t wy = Bct(dy,qy);
1335 const real_t wDy = Gct(dy,qy);
1337 for (
int dz = 0; dz < D1Dz; ++dz)
1339 gradYZ01[dz][dy] += wy * massZ[dz][1];
1340 gradYZ10[dz][dy] += wDy * massZ[dz][0];
1345 for (
int dx = 0; dx < D1Dx; ++dx)
1347 const real_t wx = Bct(dx,qx);
1348 const real_t wDx = Gct(dx,qx);
1350 for (
int dy = 0; dy < D1Dy; ++dy)
1352 for (
int dz = 0; dz < D1Dz; ++dz)
1356 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
1357 e) += (gradYZ10[dz][dy] * wx) - (gradYZ01[dz][dy] * wDx);
1367template<
int T_D1D = 0,
int T_Q1D = 0>
1368inline void SmemPACurlCurlApply3D(
const int d1d,
1370 const bool symmetric,
1372 const Array<real_t> &bo,
1373 const Array<real_t> &bc,
1374 const Array<real_t> &bot,
1375 const Array<real_t> &bct,
1376 const Array<real_t> &gc,
1377 const Array<real_t> &gct,
1378 const Vector &pa_data,
1383 "Error: d1d > HCURL_MAX_D1D");
1385 "Error: q1d > HCURL_MAX_Q1D");
1386 const int D1D = T_D1D ? T_D1D : d1d;
1387 const int Q1D = T_Q1D ? T_Q1D : q1d;
1395 auto Bo =
Reshape(bo.Read(), Q1D, D1D-1);
1396 auto Bc =
Reshape(bc.Read(), Q1D, D1D);
1397 auto Gc =
Reshape(gc.Read(), Q1D, D1D);
1398 auto op =
Reshape(pa_data.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
1399 auto X =
Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
1400 auto Y =
Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
1402 const int s = symmetric ? 6 : 9;
1404 auto device_kernel = [=] MFEM_DEVICE (
int e)
1406 constexpr int VDIM = 3;
1407 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
1408 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
1409 const int D1D = T_D1D ? T_D1D : d1d;
1410 const int Q1D = T_Q1D ? T_Q1D : q1d;
1412 MFEM_SHARED
real_t sBo[MD1D][MQ1D];
1413 MFEM_SHARED
real_t sBc[MD1D][MQ1D];
1414 MFEM_SHARED
real_t sGc[MD1D][MQ1D];
1417 MFEM_SHARED
real_t sop[9][MQ1D][MQ1D];
1418 MFEM_SHARED
real_t curl[MQ1D][MQ1D][3];
1420 MFEM_SHARED
real_t sX[MD1D][MD1D][MD1D];
1422 MFEM_FOREACH_THREAD(qx,x,Q1D)
1424 MFEM_FOREACH_THREAD(qy,y,Q1D)
1426 MFEM_FOREACH_THREAD(qz,z,Q1D)
1428 for (
int i=0; i<s; ++i)
1430 ope[i] = op(qx,qy,qz,i,e);
1436 const int tidx = MFEM_THREAD_ID(x);
1437 const int tidy = MFEM_THREAD_ID(y);
1438 const int tidz = MFEM_THREAD_ID(z);
1442 MFEM_FOREACH_THREAD(d,y,D1D)
1444 MFEM_FOREACH_THREAD(q,x,Q1D)
1446 sBc[d][q] = Bc(q,d);
1447 sGc[d][q] = Gc(q,d);
1450 sBo[d][q] = Bo(q,d);
1457 for (
int qz=0; qz < Q1D; ++qz)
1461 MFEM_FOREACH_THREAD(qy,y,Q1D)
1463 MFEM_FOREACH_THREAD(qx,x,Q1D)
1465 for (
int i=0; i<3; ++i)
1467 curl[qy][qx][i] = 0.0;
1474 for (
int c = 0; c < VDIM; ++c)
1476 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
1477 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
1478 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
1480 MFEM_FOREACH_THREAD(dz,z,D1Dz)
1482 MFEM_FOREACH_THREAD(dy,y,D1Dy)
1484 MFEM_FOREACH_THREAD(dx,x,D1Dx)
1486 sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
1496 for (
int i=0; i<s; ++i)
1498 sop[i][tidx][tidy] = ope[i];
1502 MFEM_FOREACH_THREAD(qy,y,Q1D)
1504 MFEM_FOREACH_THREAD(qx,x,Q1D)
1514 for (
int dz = 0; dz < D1Dz; ++dz)
1516 const real_t wz = sBc[dz][qz];
1517 const real_t wDz = sGc[dz][qz];
1519 for (
int dy = 0; dy < D1Dy; ++dy)
1521 const real_t wy = sBc[dy][qy];
1522 const real_t wDy = sGc[dy][qy];
1524 for (
int dx = 0; dx < D1Dx; ++dx)
1526 const real_t wx = sX[dz][dy][dx] * sBo[dx][qx];
1533 curl[qy][qx][1] += v;
1534 curl[qy][qx][2] -=
u;
1540 for (
int dz = 0; dz < D1Dz; ++dz)
1542 const real_t wz = sBc[dz][qz];
1543 const real_t wDz = sGc[dz][qz];
1545 for (
int dy = 0; dy < D1Dy; ++dy)
1547 const real_t wy = sBo[dy][qy];
1549 for (
int dx = 0; dx < D1Dx; ++dx)
1551 const real_t t = sX[dz][dy][dx];
1552 const real_t wx = t * sBc[dx][qx];
1553 const real_t wDx = t * sGc[dx][qx];
1561 curl[qy][qx][0] -= v;
1562 curl[qy][qx][2] +=
u;
1568 for (
int dz = 0; dz < D1Dz; ++dz)
1570 const real_t wz = sBo[dz][qz];
1572 for (
int dy = 0; dy < D1Dy; ++dy)
1574 const real_t wy = sBc[dy][qy];
1575 const real_t wDy = sGc[dy][qy];
1577 for (
int dx = 0; dx < D1Dx; ++dx)
1579 const real_t t = sX[dz][dy][dx];
1580 const real_t wx = t * sBc[dx][qx];
1581 const real_t wDx = t * sGc[dx][qx];
1589 curl[qy][qx][0] += v;
1590 curl[qy][qx][1] -=
u;
1596 osc += D1Dx * D1Dy * D1Dz;
1604 MFEM_FOREACH_THREAD(dz,z,D1D)
1606 const real_t wcz = sBc[dz][qz];
1607 const real_t wcDz = sGc[dz][qz];
1608 const real_t wz = (dz < D1D-1) ? sBo[dz][qz] : 0.0;
1610 MFEM_FOREACH_THREAD(dy,y,D1D)
1612 MFEM_FOREACH_THREAD(dx,x,D1D)
1614 for (
int qy = 0; qy < Q1D; ++qy)
1616 const real_t wcy = sBc[dy][qy];
1617 const real_t wcDy = sGc[dy][qy];
1618 const real_t wy = (dy < D1D-1) ? sBo[dy][qy] : 0.0;
1620 for (
int qx = 0; qx < Q1D; ++qx)
1622 const real_t O11 = sop[0][qx][qy];
1623 const real_t O12 = sop[1][qx][qy];
1624 const real_t O13 = sop[2][qx][qy];
1625 const real_t O21 = symmetric ? O12 : sop[3][qx][qy];
1626 const real_t O22 = symmetric ? sop[3][qx][qy] : sop[4][qx][qy];
1627 const real_t O23 = symmetric ? sop[4][qx][qy] : sop[5][qx][qy];
1628 const real_t O31 = symmetric ? O13 : sop[6][qx][qy];
1629 const real_t O32 = symmetric ? O23 : sop[7][qx][qy];
1630 const real_t O33 = symmetric ? sop[5][qx][qy] : sop[8][qx][qy];
1632 const real_t c1 = (O11 * curl[qy][qx][0]) + (O12 * curl[qy][qx][1]) +
1633 (O13 * curl[qy][qx][2]);
1634 const real_t c2 = (O21 * curl[qy][qx][0]) + (O22 * curl[qy][qx][1]) +
1635 (O23 * curl[qy][qx][2]);
1636 const real_t c3 = (O31 * curl[qy][qx][0]) + (O32 * curl[qy][qx][1]) +
1637 (O33 * curl[qy][qx][2]);
1639 const real_t wcx = sBc[dx][qx];
1640 const real_t wDx = sGc[dx][qx];
1646 const real_t wx = sBo[dx][qx];
1647 dxyz1 += (wx * c2 * wcy * wcDz) - (wx * c3 * wcDy * wcz);
1652 dxyz2 += (-wy * c1 * wcx * wcDz) + (wy * c3 * wDx * wcz);
1656 dxyz3 += (wcDy * wz * c1 * wcx) - (wcy * wz * c2 * wDx);
1665 MFEM_FOREACH_THREAD(dz,z,D1D)
1667 MFEM_FOREACH_THREAD(dy,y,D1D)
1669 MFEM_FOREACH_THREAD(dx,x,D1D)
1673 Y(dx + ((dy + (dz * D1D)) * (D1D-1)), e) += dxyz1;
1677 Y(dx + ((dy + (dz * (D1D-1))) * D1D) + ((D1D-1)*D1D*D1D), e) += dxyz2;
1681 Y(dx + ((dy + (dz * D1D)) * D1D) + (2*(D1D-1)*D1D*D1D), e) += dxyz3;
1689 auto host_kernel = [&] MFEM_LAMBDA (
int)
1691 MFEM_ABORT_KERNEL(
"This kernel should only be used on GPU.");
1694 ForallWrap<3>(
true, NE, device_kernel, host_kernel, Q1D, Q1D, Q1D);
1698void PAHcurlL2Setup2D(
const int Q1D,
1700 const Array<real_t> &w,
1705void PAHcurlL2Setup3D(
const int NQ,
1708 const Array<real_t> &w,
1713void PAHcurlL2Apply2D(
const int D1D,
1717 const Array<real_t> &bo,
1718 const Array<real_t> &bot,
1719 const Array<real_t> &bt,
1720 const Array<real_t> &gc,
1721 const Vector &pa_data,
1726void PAHcurlL2ApplyTranspose2D(
const int D1D,
1730 const Array<real_t> &bo,
1731 const Array<real_t> &bot,
1732 const Array<real_t> &
b,
1733 const Array<real_t> &gct,
1734 const Vector &pa_data,
1739template<
int T_D1D = 0,
int T_Q1D = 0>
1740inline void PAHcurlL2Apply3D(
const int d1d,
1744 const Array<real_t> &bo,
1745 const Array<real_t> &bc,
1746 const Array<real_t> &bot,
1747 const Array<real_t> &bct,
1748 const Array<real_t> &gc,
1749 const Vector &pa_data,
1754 "Error: d1d > HCURL_MAX_D1D");
1756 "Error: q1d > HCURL_MAX_Q1D");
1757 const int D1D = T_D1D ? T_D1D : d1d;
1758 const int Q1D = T_Q1D ? T_Q1D : q1d;
1760 auto Bo =
Reshape(bo.Read(), Q1D, D1D-1);
1761 auto Bc =
Reshape(bc.Read(), Q1D, D1D);
1762 auto Bot =
Reshape(bot.Read(), D1D-1, Q1D);
1763 auto Bct =
Reshape(bct.Read(), D1D, Q1D);
1764 auto Gc =
Reshape(gc.Read(), Q1D, D1D);
1765 auto op =
Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
1766 auto X =
Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
1767 auto Y =
Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
1780 constexpr int VDIM = 3;
1781 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
1782 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
1783 const int D1D = T_D1D ? T_D1D : d1d;
1784 const int Q1D = T_Q1D ? T_Q1D : q1d;
1786 real_t curl[MQ1D][MQ1D][MQ1D][VDIM];
1789 for (
int qz = 0; qz < Q1D; ++qz)
1791 for (
int qy = 0; qy < Q1D; ++qy)
1793 for (
int qx = 0; qx < Q1D; ++qx)
1795 for (
int c = 0; c < VDIM; ++c)
1797 curl[qz][qy][qx][c] = 0.0;
1809 const int D1Dz = D1D;
1810 const int D1Dy = D1D;
1811 const int D1Dx = D1D - 1;
1813 for (
int dz = 0; dz < D1Dz; ++dz)
1815 real_t gradXY[MQ1D][MQ1D][2];
1816 for (
int qy = 0; qy < Q1D; ++qy)
1818 for (
int qx = 0; qx < Q1D; ++qx)
1820 for (
int d = 0; d < 2; ++d)
1822 gradXY[qy][qx][d] = 0.0;
1827 for (
int dy = 0; dy < D1Dy; ++dy)
1830 for (
int qx = 0; qx < Q1D; ++qx)
1835 for (
int dx = 0; dx < D1Dx; ++dx)
1837 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
1838 for (
int qx = 0; qx < Q1D; ++qx)
1840 massX[qx] += t * Bo(qx,dx);
1844 for (
int qy = 0; qy < Q1D; ++qy)
1846 const real_t wy = Bc(qy,dy);
1847 const real_t wDy = Gc(qy,dy);
1848 for (
int qx = 0; qx < Q1D; ++qx)
1850 const real_t wx = massX[qx];
1851 gradXY[qy][qx][0] += wx * wDy;
1852 gradXY[qy][qx][1] += wx * wy;
1857 for (
int qz = 0; qz < Q1D; ++qz)
1859 const real_t wz = Bc(qz,dz);
1860 const real_t wDz = Gc(qz,dz);
1861 for (
int qy = 0; qy < Q1D; ++qy)
1863 for (
int qx = 0; qx < Q1D; ++qx)
1866 curl[qz][qy][qx][1] += gradXY[qy][qx][1] * wDz;
1867 curl[qz][qy][qx][2] -= gradXY[qy][qx][0] * wz;
1873 osc += D1Dx * D1Dy * D1Dz;
1878 const int D1Dz = D1D;
1879 const int D1Dy = D1D - 1;
1880 const int D1Dx = D1D;
1882 for (
int dz = 0; dz < D1Dz; ++dz)
1884 real_t gradXY[MQ1D][MQ1D][2];
1885 for (
int qy = 0; qy < Q1D; ++qy)
1887 for (
int qx = 0; qx < Q1D; ++qx)
1889 for (
int d = 0; d < 2; ++d)
1891 gradXY[qy][qx][d] = 0.0;
1896 for (
int dx = 0; dx < D1Dx; ++dx)
1899 for (
int qy = 0; qy < Q1D; ++qy)
1904 for (
int dy = 0; dy < D1Dy; ++dy)
1906 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
1907 for (
int qy = 0; qy < Q1D; ++qy)
1909 massY[qy] += t * Bo(qy,dy);
1913 for (
int qx = 0; qx < Q1D; ++qx)
1915 const real_t wx = Bc(qx,dx);
1916 const real_t wDx = Gc(qx,dx);
1917 for (
int qy = 0; qy < Q1D; ++qy)
1919 const real_t wy = massY[qy];
1920 gradXY[qy][qx][0] += wDx * wy;
1921 gradXY[qy][qx][1] += wx * wy;
1926 for (
int qz = 0; qz < Q1D; ++qz)
1928 const real_t wz = Bc(qz,dz);
1929 const real_t wDz = Gc(qz,dz);
1930 for (
int qy = 0; qy < Q1D; ++qy)
1932 for (
int qx = 0; qx < Q1D; ++qx)
1935 curl[qz][qy][qx][0] -= gradXY[qy][qx][1] * wDz;
1936 curl[qz][qy][qx][2] += gradXY[qy][qx][0] * wz;
1942 osc += D1Dx * D1Dy * D1Dz;
1947 const int D1Dz = D1D - 1;
1948 const int D1Dy = D1D;
1949 const int D1Dx = D1D;
1951 for (
int dx = 0; dx < D1Dx; ++dx)
1953 real_t gradYZ[MQ1D][MQ1D][2];
1954 for (
int qz = 0; qz < Q1D; ++qz)
1956 for (
int qy = 0; qy < Q1D; ++qy)
1958 for (
int d = 0; d < 2; ++d)
1960 gradYZ[qz][qy][d] = 0.0;
1965 for (
int dy = 0; dy < D1Dy; ++dy)
1968 for (
int qz = 0; qz < Q1D; ++qz)
1973 for (
int dz = 0; dz < D1Dz; ++dz)
1975 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
1976 for (
int qz = 0; qz < Q1D; ++qz)
1978 massZ[qz] += t * Bo(qz,dz);
1982 for (
int qy = 0; qy < Q1D; ++qy)
1984 const real_t wy = Bc(qy,dy);
1985 const real_t wDy = Gc(qy,dy);
1986 for (
int qz = 0; qz < Q1D; ++qz)
1988 const real_t wz = massZ[qz];
1989 gradYZ[qz][qy][0] += wz * wy;
1990 gradYZ[qz][qy][1] += wz * wDy;
1995 for (
int qx = 0; qx < Q1D; ++qx)
1997 const real_t wx = Bc(qx,dx);
1998 const real_t wDx = Gc(qx,dx);
2000 for (
int qy = 0; qy < Q1D; ++qy)
2002 for (
int qz = 0; qz < Q1D; ++qz)
2005 curl[qz][qy][qx][0] += gradYZ[qz][qy][1] * wx;
2006 curl[qz][qy][qx][1] -= gradYZ[qz][qy][0] * wDx;
2014 for (
int qz = 0; qz < Q1D; ++qz)
2016 for (
int qy = 0; qy < Q1D; ++qy)
2018 for (
int qx = 0; qx < Q1D; ++qx)
2020 const real_t O11 = op(0,qx,qy,qz,e);
2023 for (
int c = 0; c < VDIM; ++c)
2025 curl[qz][qy][qx][c] *= O11;
2030 const real_t O21 = op(1,qx,qy,qz,e);
2031 const real_t O31 = op(2,qx,qy,qz,e);
2032 const real_t O12 = op(3,qx,qy,qz,e);
2033 const real_t O22 = op(4,qx,qy,qz,e);
2034 const real_t O32 = op(5,qx,qy,qz,e);
2035 const real_t O13 = op(6,qx,qy,qz,e);
2036 const real_t O23 = op(7,qx,qy,qz,e);
2037 const real_t O33 = op(8,qx,qy,qz,e);
2038 const real_t curlX = curl[qz][qy][qx][0];
2039 const real_t curlY = curl[qz][qy][qx][1];
2040 const real_t curlZ = curl[qz][qy][qx][2];
2041 curl[qz][qy][qx][0] = (O11*curlX)+(O12*curlY)+(O13*curlZ);
2042 curl[qz][qy][qx][1] = (O21*curlX)+(O22*curlY)+(O23*curlZ);
2043 curl[qz][qy][qx][2] = (O31*curlX)+(O32*curlY)+(O33*curlZ);
2049 for (
int qz = 0; qz < Q1D; ++qz)
2051 real_t massXY[MD1D][MD1D];
2055 for (
int c = 0; c < VDIM; ++c)
2057 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
2058 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
2059 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
2061 for (
int dy = 0; dy < D1Dy; ++dy)
2063 for (
int dx = 0; dx < D1Dx; ++dx)
2068 for (
int qy = 0; qy < Q1D; ++qy)
2071 for (
int dx = 0; dx < D1Dx; ++dx)
2075 for (
int qx = 0; qx < Q1D; ++qx)
2077 for (
int dx = 0; dx < D1Dx; ++dx)
2079 massX[dx] += curl[qz][qy][qx][c] * ((c == 0) ? Bot(dx,qx) : Bct(dx,qx));
2083 for (
int dy = 0; dy < D1Dy; ++dy)
2085 const real_t wy = (c == 1) ? Bot(dy,qy) : Bct(dy,qy);
2086 for (
int dx = 0; dx < D1Dx; ++dx)
2088 massXY[dy][dx] += massX[dx] * wy;
2093 for (
int dz = 0; dz < D1Dz; ++dz)
2095 const real_t wz = (c == 2) ? Bot(dz,qz) : Bct(dz,qz);
2096 for (
int dy = 0; dy < D1Dy; ++dy)
2098 for (
int dx = 0; dx < D1Dx; ++dx)
2100 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += massXY[dy][dx] * wz;
2105 osc += D1Dx * D1Dy * D1Dz;
2112template<
int T_D1D = 0,
int T_Q1D = 0>
2113inline void SmemPAHcurlL2Apply3D(
const int d1d,
2117 const Array<real_t> &bo,
2118 const Array<real_t> &bc,
2119 const Array<real_t> &gc,
2120 const Vector &pa_data,
2125 "Error: d1d > HCURL_MAX_D1D");
2127 "Error: q1d > HCURL_MAX_Q1D");
2128 const int D1D = T_D1D ? T_D1D : d1d;
2129 const int Q1D = T_Q1D ? T_Q1D : q1d;
2131 auto Bo =
Reshape(bo.Read(), Q1D, D1D-1);
2132 auto Bc =
Reshape(bc.Read(), Q1D, D1D);
2133 auto Gc =
Reshape(gc.Read(), Q1D, D1D);
2134 auto op =
Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
2135 auto X =
Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
2136 auto Y =
Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
2138 auto device_kernel = [=] MFEM_DEVICE (
int e)
2140 constexpr int VDIM = 3;
2141 constexpr int maxCoeffDim = 9;
2142 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
2143 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
2144 const int D1D = T_D1D ? T_D1D : d1d;
2145 const int Q1D = T_Q1D ? T_Q1D : q1d;
2147 MFEM_SHARED
real_t sBo[MD1D][MQ1D];
2148 MFEM_SHARED
real_t sBc[MD1D][MQ1D];
2149 MFEM_SHARED
real_t sGc[MD1D][MQ1D];
2152 MFEM_SHARED
real_t sop[maxCoeffDim][MQ1D][MQ1D];
2153 MFEM_SHARED
real_t curl[MQ1D][MQ1D][3];
2155 MFEM_SHARED
real_t sX[MD1D][MD1D][MD1D];
2157 MFEM_FOREACH_THREAD(qx,x,Q1D)
2159 MFEM_FOREACH_THREAD(qy,y,Q1D)
2161 MFEM_FOREACH_THREAD(qz,z,Q1D)
2163 for (
int i=0; i<coeffDim; ++i)
2165 opc[i] = op(i,qx,qy,qz,e);
2171 const int tidx = MFEM_THREAD_ID(x);
2172 const int tidy = MFEM_THREAD_ID(y);
2173 const int tidz = MFEM_THREAD_ID(z);
2177 MFEM_FOREACH_THREAD(d,y,D1D)
2179 MFEM_FOREACH_THREAD(q,x,Q1D)
2181 sBc[d][q] = Bc(q,d);
2182 sGc[d][q] = Gc(q,d);
2185 sBo[d][q] = Bo(q,d);
2192 for (
int qz=0; qz < Q1D; ++qz)
2196 MFEM_FOREACH_THREAD(qy,y,Q1D)
2198 MFEM_FOREACH_THREAD(qx,x,Q1D)
2200 for (
int i=0; i<3; ++i)
2202 curl[qy][qx][i] = 0.0;
2209 for (
int c = 0; c < VDIM; ++c)
2211 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
2212 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
2213 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
2215 MFEM_FOREACH_THREAD(dz,z,D1Dz)
2217 MFEM_FOREACH_THREAD(dy,y,D1Dy)
2219 MFEM_FOREACH_THREAD(dx,x,D1Dx)
2221 sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
2231 for (
int i=0; i<coeffDim; ++i)
2233 sop[i][tidx][tidy] = opc[i];
2237 MFEM_FOREACH_THREAD(qy,y,Q1D)
2239 MFEM_FOREACH_THREAD(qx,x,Q1D)
2249 for (
int dz = 0; dz < D1Dz; ++dz)
2251 const real_t wz = sBc[dz][qz];
2252 const real_t wDz = sGc[dz][qz];
2254 for (
int dy = 0; dy < D1Dy; ++dy)
2256 const real_t wy = sBc[dy][qy];
2257 const real_t wDy = sGc[dy][qy];
2259 for (
int dx = 0; dx < D1Dx; ++dx)
2261 const real_t wx = sX[dz][dy][dx] * sBo[dx][qx];
2268 curl[qy][qx][1] += v;
2269 curl[qy][qx][2] -=
u;
2275 for (
int dz = 0; dz < D1Dz; ++dz)
2277 const real_t wz = sBc[dz][qz];
2278 const real_t wDz = sGc[dz][qz];
2280 for (
int dy = 0; dy < D1Dy; ++dy)
2282 const real_t wy = sBo[dy][qy];
2284 for (
int dx = 0; dx < D1Dx; ++dx)
2286 const real_t t = sX[dz][dy][dx];
2287 const real_t wx = t * sBc[dx][qx];
2288 const real_t wDx = t * sGc[dx][qx];
2296 curl[qy][qx][0] -= v;
2297 curl[qy][qx][2] +=
u;
2303 for (
int dz = 0; dz < D1Dz; ++dz)
2305 const real_t wz = sBo[dz][qz];
2307 for (
int dy = 0; dy < D1Dy; ++dy)
2309 const real_t wy = sBc[dy][qy];
2310 const real_t wDy = sGc[dy][qy];
2312 for (
int dx = 0; dx < D1Dx; ++dx)
2314 const real_t t = sX[dz][dy][dx];
2315 const real_t wx = t * sBc[dx][qx];
2316 const real_t wDx = t * sGc[dx][qx];
2324 curl[qy][qx][0] += v;
2325 curl[qy][qx][1] -=
u;
2331 osc += D1Dx * D1Dy * D1Dz;
2339 MFEM_FOREACH_THREAD(dz,z,D1D)
2341 const real_t wcz = sBc[dz][qz];
2342 const real_t wz = (dz < D1D-1) ? sBo[dz][qz] : 0.0;
2344 MFEM_FOREACH_THREAD(dy,y,D1D)
2346 MFEM_FOREACH_THREAD(dx,x,D1D)
2348 for (
int qy = 0; qy < Q1D; ++qy)
2350 const real_t wcy = sBc[dy][qy];
2351 const real_t wy = (dy < D1D-1) ? sBo[dy][qy] : 0.0;
2353 for (
int qx = 0; qx < Q1D; ++qx)
2355 const real_t O11 = sop[0][qx][qy];
2359 c1 = O11 * curl[qy][qx][0];
2360 c2 = O11 * curl[qy][qx][1];
2361 c3 = O11 * curl[qy][qx][2];
2365 const real_t O21 = sop[1][qx][qy];
2366 const real_t O31 = sop[2][qx][qy];
2367 const real_t O12 = sop[3][qx][qy];
2368 const real_t O22 = sop[4][qx][qy];
2369 const real_t O32 = sop[5][qx][qy];
2370 const real_t O13 = sop[6][qx][qy];
2371 const real_t O23 = sop[7][qx][qy];
2372 const real_t O33 = sop[8][qx][qy];
2373 c1 = (O11*curl[qy][qx][0])+(O12*curl[qy][qx][1])+(O13*curl[qy][qx][2]);
2374 c2 = (O21*curl[qy][qx][0])+(O22*curl[qy][qx][1])+(O23*curl[qy][qx][2]);
2375 c3 = (O31*curl[qy][qx][0])+(O32*curl[qy][qx][1])+(O33*curl[qy][qx][2]);
2378 const real_t wcx = sBc[dx][qx];
2382 const real_t wx = sBo[dx][qx];
2383 dxyz1 += c1 * wx * wcy * wcz;
2386 dxyz2 += c2 * wcx * wy * wcz;
2387 dxyz3 += c3 * wcx * wcy * wz;
2396 MFEM_FOREACH_THREAD(dz,z,D1D)
2398 MFEM_FOREACH_THREAD(dy,y,D1D)
2400 MFEM_FOREACH_THREAD(dx,x,D1D)
2404 Y(dx + ((dy + (dz * D1D)) * (D1D-1)), e) += dxyz1;
2408 Y(dx + ((dy + (dz * (D1D-1))) * D1D) + ((D1D-1)*D1D*D1D), e) += dxyz2;
2412 Y(dx + ((dy + (dz * D1D)) * D1D) + (2*(D1D-1)*D1D*D1D), e) += dxyz3;
2420 auto host_kernel = [&] MFEM_LAMBDA (
int)
2422 MFEM_ABORT_KERNEL(
"This kernel should only be used on GPU.");
2425 ForallWrap<3>(
true, NE, device_kernel, host_kernel, Q1D, Q1D, Q1D);
2429template<
int T_D1D = 0,
int T_Q1D = 0>
2430inline void PAHcurlL2ApplyTranspose3D(
const int d1d,
2434 const Array<real_t> &bo,
2435 const Array<real_t> &bc,
2436 const Array<real_t> &bot,
2437 const Array<real_t> &bct,
2438 const Array<real_t> &gct,
2439 const Vector &pa_data,
2445 "Error: d1d > HCURL_MAX_D1D");
2447 "Error: q1d > HCURL_MAX_Q1D");
2448 const int D1D = T_D1D ? T_D1D : d1d;
2449 const int Q1D = T_Q1D ? T_Q1D : q1d;
2451 auto Bo =
Reshape(bo.Read(), Q1D, D1D-1);
2452 auto Bc =
Reshape(bc.Read(), Q1D, D1D);
2453 auto Bot =
Reshape(bot.Read(), D1D-1, Q1D);
2454 auto Bct =
Reshape(bct.Read(), D1D, Q1D);
2455 auto Gct =
Reshape(gct.Read(), D1D, Q1D);
2456 auto op =
Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
2457 auto X =
Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
2458 auto Y =
Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
2462 constexpr int VDIM = 3;
2463 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
2464 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
2465 const int D1D = T_D1D ? T_D1D : d1d;
2466 const int Q1D = T_Q1D ? T_Q1D : q1d;
2468 real_t mass[MQ1D][MQ1D][MQ1D][VDIM];
2470 for (
int qz = 0; qz < Q1D; ++qz)
2472 for (
int qy = 0; qy < Q1D; ++qy)
2474 for (
int qx = 0; qx < Q1D; ++qx)
2476 for (
int c = 0; c < VDIM; ++c)
2478 mass[qz][qy][qx][c] = 0.0;
2486 for (
int c = 0; c < VDIM; ++c)
2488 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
2489 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
2490 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
2492 for (
int dz = 0; dz < D1Dz; ++dz)
2494 real_t massXY[MQ1D][MQ1D];
2495 for (
int qy = 0; qy < Q1D; ++qy)
2497 for (
int qx = 0; qx < Q1D; ++qx)
2499 massXY[qy][qx] = 0.0;
2503 for (
int dy = 0; dy < D1Dy; ++dy)
2506 for (
int qx = 0; qx < Q1D; ++qx)
2511 for (
int dx = 0; dx < D1Dx; ++dx)
2513 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
2514 for (
int qx = 0; qx < Q1D; ++qx)
2516 massX[qx] += t * ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
2520 for (
int qy = 0; qy < Q1D; ++qy)
2522 const real_t wy = (c == 1) ? Bo(qy,dy) : Bc(qy,dy);
2523 for (
int qx = 0; qx < Q1D; ++qx)
2525 const real_t wx = massX[qx];
2526 massXY[qy][qx] += wx * wy;
2531 for (
int qz = 0; qz < Q1D; ++qz)
2533 const real_t wz = (c == 2) ? Bo(qz,dz) : Bc(qz,dz);
2534 for (
int qy = 0; qy < Q1D; ++qy)
2536 for (
int qx = 0; qx < Q1D; ++qx)
2538 mass[qz][qy][qx][c] += massXY[qy][qx] * wz;
2544 osc += D1Dx * D1Dy * D1Dz;
2548 for (
int qz = 0; qz < Q1D; ++qz)
2550 for (
int qy = 0; qy < Q1D; ++qy)
2552 for (
int qx = 0; qx < Q1D; ++qx)
2554 const real_t O11 = op(0,qx,qy,qz,e);
2557 for (
int c = 0; c < VDIM; ++c)
2559 mass[qz][qy][qx][c] *= O11;
2564 const real_t O12 = op(1,qx,qy,qz,e);
2565 const real_t O13 = op(2,qx,qy,qz,e);
2566 const real_t O21 = op(3,qx,qy,qz,e);
2567 const real_t O22 = op(4,qx,qy,qz,e);
2568 const real_t O23 = op(5,qx,qy,qz,e);
2569 const real_t O31 = op(6,qx,qy,qz,e);
2570 const real_t O32 = op(7,qx,qy,qz,e);
2571 const real_t O33 = op(8,qx,qy,qz,e);
2572 const real_t massX = mass[qz][qy][qx][0];
2573 const real_t massY = mass[qz][qy][qx][1];
2574 const real_t massZ = mass[qz][qy][qx][2];
2575 mass[qz][qy][qx][0] = (O11*massX)+(O12*massY)+(O13*massZ);
2576 mass[qz][qy][qx][1] = (O21*massX)+(O22*massY)+(O23*massZ);
2577 mass[qz][qy][qx][2] = (O31*massX)+(O32*massY)+(O33*massZ);
2586 const int D1Dz = D1D;
2587 const int D1Dy = D1D;
2588 const int D1Dx = D1D - 1;
2590 for (
int qz = 0; qz < Q1D; ++qz)
2592 real_t gradXY12[MD1D][MD1D];
2593 real_t gradXY21[MD1D][MD1D];
2595 for (
int dy = 0; dy < D1Dy; ++dy)
2597 for (
int dx = 0; dx < D1Dx; ++dx)
2599 gradXY12[dy][dx] = 0.0;
2600 gradXY21[dy][dx] = 0.0;
2603 for (
int qy = 0; qy < Q1D; ++qy)
2606 for (
int dx = 0; dx < D1Dx; ++dx)
2608 for (
int n = 0; n < 2; ++n)
2613 for (
int qx = 0; qx < Q1D; ++qx)
2615 for (
int dx = 0; dx < D1Dx; ++dx)
2617 const real_t wx = Bot(dx,qx);
2619 massX[dx][0] += wx * mass[qz][qy][qx][1];
2620 massX[dx][1] += wx * mass[qz][qy][qx][2];
2623 for (
int dy = 0; dy < D1Dy; ++dy)
2625 const real_t wy = Bct(dy,qy);
2626 const real_t wDy = Gct(dy,qy);
2628 for (
int dx = 0; dx < D1Dx; ++dx)
2630 gradXY21[dy][dx] += massX[dx][0] * wy;
2631 gradXY12[dy][dx] += massX[dx][1] * wDy;
2636 for (
int dz = 0; dz < D1Dz; ++dz)
2638 const real_t wz = Bct(dz,qz);
2639 const real_t wDz = Gct(dz,qz);
2640 for (
int dy = 0; dy < D1Dy; ++dy)
2642 for (
int dx = 0; dx < D1Dx; ++dx)
2646 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
2647 e) += (gradXY21[dy][dx] * wDz) - (gradXY12[dy][dx] * wz);
2653 osc += D1Dx * D1Dy * D1Dz;
2658 const int D1Dz = D1D;
2659 const int D1Dy = D1D - 1;
2660 const int D1Dx = D1D;
2662 for (
int qz = 0; qz < Q1D; ++qz)
2664 real_t gradXY02[MD1D][MD1D];
2665 real_t gradXY20[MD1D][MD1D];
2667 for (
int dy = 0; dy < D1Dy; ++dy)
2669 for (
int dx = 0; dx < D1Dx; ++dx)
2671 gradXY02[dy][dx] = 0.0;
2672 gradXY20[dy][dx] = 0.0;
2675 for (
int qx = 0; qx < Q1D; ++qx)
2678 for (
int dy = 0; dy < D1Dy; ++dy)
2683 for (
int qy = 0; qy < Q1D; ++qy)
2685 for (
int dy = 0; dy < D1Dy; ++dy)
2687 const real_t wy = Bot(dy,qy);
2689 massY[dy][0] += wy * mass[qz][qy][qx][2];
2690 massY[dy][1] += wy * mass[qz][qy][qx][0];
2693 for (
int dx = 0; dx < D1Dx; ++dx)
2695 const real_t wx = Bct(dx,qx);
2696 const real_t wDx = Gct(dx,qx);
2698 for (
int dy = 0; dy < D1Dy; ++dy)
2700 gradXY02[dy][dx] += massY[dy][0] * wDx;
2701 gradXY20[dy][dx] += massY[dy][1] * wx;
2706 for (
int dz = 0; dz < D1Dz; ++dz)
2708 const real_t wz = Bct(dz,qz);
2709 const real_t wDz = Gct(dz,qz);
2710 for (
int dy = 0; dy < D1Dy; ++dy)
2712 for (
int dx = 0; dx < D1Dx; ++dx)
2716 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
2717 e) += (-gradXY20[dy][dx] * wDz) + (gradXY02[dy][dx] * wz);
2723 osc += D1Dx * D1Dy * D1Dz;
2728 const int D1Dz = D1D - 1;
2729 const int D1Dy = D1D;
2730 const int D1Dx = D1D;
2732 for (
int qx = 0; qx < Q1D; ++qx)
2734 real_t gradYZ01[MD1D][MD1D];
2735 real_t gradYZ10[MD1D][MD1D];
2737 for (
int dy = 0; dy < D1Dy; ++dy)
2739 for (
int dz = 0; dz < D1Dz; ++dz)
2741 gradYZ01[dz][dy] = 0.0;
2742 gradYZ10[dz][dy] = 0.0;
2745 for (
int qy = 0; qy < Q1D; ++qy)
2748 for (
int dz = 0; dz < D1Dz; ++dz)
2750 for (
int n = 0; n < 2; ++n)
2755 for (
int qz = 0; qz < Q1D; ++qz)
2757 for (
int dz = 0; dz < D1Dz; ++dz)
2759 const real_t wz = Bot(dz,qz);
2761 massZ[dz][0] += wz * mass[qz][qy][qx][0];
2762 massZ[dz][1] += wz * mass[qz][qy][qx][1];
2765 for (
int dy = 0; dy < D1Dy; ++dy)
2767 const real_t wy = Bct(dy,qy);
2768 const real_t wDy = Gct(dy,qy);
2770 for (
int dz = 0; dz < D1Dz; ++dz)
2772 gradYZ01[dz][dy] += wy * massZ[dz][1];
2773 gradYZ10[dz][dy] += wDy * massZ[dz][0];
2778 for (
int dx = 0; dx < D1Dx; ++dx)
2780 const real_t wx = Bct(dx,qx);
2781 const real_t wDx = Gct(dx,qx);
2783 for (
int dy = 0; dy < D1Dy; ++dy)
2785 for (
int dz = 0; dz < D1Dz; ++dz)
2789 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
2790 e) += (gradYZ10[dz][dy] * wx) - (gradYZ01[dz][dy] * wDx);
2800template<
int T_D1D = 0,
int T_Q1D = 0>
2801inline void SmemPAHcurlL2ApplyTranspose3D(
const int d1d,
2805 const Array<real_t> &bo,
2806 const Array<real_t> &bc,
2807 const Array<real_t> &gc,
2808 const Vector &pa_data,
2813 "Error: d1d > HCURL_MAX_D1D");
2815 "Error: q1d > HCURL_MAX_Q1D");
2816 const int D1D = T_D1D ? T_D1D : d1d;
2817 const int Q1D = T_Q1D ? T_Q1D : q1d;
2819 auto Bo =
Reshape(bo.Read(), Q1D, D1D-1);
2820 auto Bc =
Reshape(bc.Read(), Q1D, D1D);
2821 auto Gc =
Reshape(gc.Read(), Q1D, D1D);
2822 auto op =
Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
2823 auto X =
Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
2824 auto Y =
Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
2826 auto device_kernel = [=] MFEM_DEVICE (
int e)
2828 constexpr int VDIM = 3;
2829 constexpr int maxCoeffDim = 9;
2830 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
2831 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
2832 const int D1D = T_D1D ? T_D1D : d1d;
2833 const int Q1D = T_Q1D ? T_Q1D : q1d;
2835 MFEM_SHARED
real_t sBo[MD1D][MQ1D];
2836 MFEM_SHARED
real_t sBc[MD1D][MQ1D];
2837 MFEM_SHARED
real_t sGc[MD1D][MQ1D];
2840 MFEM_SHARED
real_t sop[maxCoeffDim][MQ1D][MQ1D];
2841 MFEM_SHARED
real_t mass[MQ1D][MQ1D][3];
2843 MFEM_SHARED
real_t sX[MD1D][MD1D][MD1D];
2845 MFEM_FOREACH_THREAD(qx,x,Q1D)
2847 MFEM_FOREACH_THREAD(qy,y,Q1D)
2849 MFEM_FOREACH_THREAD(qz,z,Q1D)
2851 for (
int i=0; i<coeffDim; ++i)
2853 opc[i] = op(i,qx,qy,qz,e);
2859 const int tidx = MFEM_THREAD_ID(x);
2860 const int tidy = MFEM_THREAD_ID(y);
2861 const int tidz = MFEM_THREAD_ID(z);
2865 MFEM_FOREACH_THREAD(d,y,D1D)
2867 MFEM_FOREACH_THREAD(q,x,Q1D)
2869 sBc[d][q] = Bc(q,d);
2870 sGc[d][q] = Gc(q,d);
2873 sBo[d][q] = Bo(q,d);
2880 for (
int qz=0; qz < Q1D; ++qz)
2884 MFEM_FOREACH_THREAD(qy,y,Q1D)
2886 MFEM_FOREACH_THREAD(qx,x,Q1D)
2888 for (
int i=0; i<3; ++i)
2890 mass[qy][qx][i] = 0.0;
2897 for (
int c = 0; c < VDIM; ++c)
2899 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
2900 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
2901 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
2903 MFEM_FOREACH_THREAD(dz,z,D1Dz)
2905 MFEM_FOREACH_THREAD(dy,y,D1Dy)
2907 MFEM_FOREACH_THREAD(dx,x,D1Dx)
2909 sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
2919 for (
int i=0; i<coeffDim; ++i)
2921 sop[i][tidx][tidy] = opc[i];
2925 MFEM_FOREACH_THREAD(qy,y,Q1D)
2927 MFEM_FOREACH_THREAD(qx,x,Q1D)
2931 for (
int dz = 0; dz < D1Dz; ++dz)
2933 const real_t wz = (c == 2) ? sBo[dz][qz] : sBc[dz][qz];
2935 for (
int dy = 0; dy < D1Dy; ++dy)
2937 const real_t wy = (c == 1) ? sBo[dy][qy] : sBc[dy][qy];
2939 for (
int dx = 0; dx < D1Dx; ++dx)
2941 const real_t wx = sX[dz][dy][dx] * ((c == 0) ? sBo[dx][qx] : sBc[dx][qx]);
2947 mass[qy][qx][c] +=
u;
2952 osc += D1Dx * D1Dy * D1Dz;
2960 MFEM_FOREACH_THREAD(dz,z,D1D)
2962 const real_t wcz = sBc[dz][qz];
2963 const real_t wcDz = sGc[dz][qz];
2964 const real_t wz = (dz < D1D-1) ? sBo[dz][qz] : 0.0;
2966 MFEM_FOREACH_THREAD(dy,y,D1D)
2968 MFEM_FOREACH_THREAD(dx,x,D1D)
2970 for (
int qy = 0; qy < Q1D; ++qy)
2972 const real_t wcy = sBc[dy][qy];
2973 const real_t wcDy = sGc[dy][qy];
2974 const real_t wy = (dy < D1D-1) ? sBo[dy][qy] : 0.0;
2976 for (
int qx = 0; qx < Q1D; ++qx)
2978 const real_t O11 = sop[0][qx][qy];
2982 c1 = O11 * mass[qy][qx][0];
2983 c2 = O11 * mass[qy][qx][1];
2984 c3 = O11 * mass[qy][qx][2];
2988 const real_t O12 = sop[1][qx][qy];
2989 const real_t O13 = sop[2][qx][qy];
2990 const real_t O21 = sop[3][qx][qy];
2991 const real_t O22 = sop[4][qx][qy];
2992 const real_t O23 = sop[5][qx][qy];
2993 const real_t O31 = sop[6][qx][qy];
2994 const real_t O32 = sop[7][qx][qy];
2995 const real_t O33 = sop[8][qx][qy];
2997 c1 = (O11*mass[qy][qx][0])+(O12*mass[qy][qx][1])+(O13*mass[qy][qx][2]);
2998 c2 = (O21*mass[qy][qx][0])+(O22*mass[qy][qx][1])+(O23*mass[qy][qx][2]);
2999 c3 = (O31*mass[qy][qx][0])+(O32*mass[qy][qx][1])+(O33*mass[qy][qx][2]);
3002 const real_t wcx = sBc[dx][qx];
3003 const real_t wDx = sGc[dx][qx];
3007 const real_t wx = sBo[dx][qx];
3008 dxyz1 += (wx * c2 * wcy * wcDz) - (wx * c3 * wcDy * wcz);
3011 dxyz2 += (-wy * c1 * wcx * wcDz) + (wy * c3 * wDx * wcz);
3013 dxyz3 += (wcDy * wz * c1 * wcx) - (wcy * wz * c2 * wDx);
3022 MFEM_FOREACH_THREAD(dz,z,D1D)
3024 MFEM_FOREACH_THREAD(dy,y,D1D)
3026 MFEM_FOREACH_THREAD(dx,x,D1D)
3030 Y(dx + ((dy + (dz * D1D)) * (D1D-1)), e) += dxyz1;
3034 Y(dx + ((dy + (dz * (D1D-1))) * D1D) + ((D1D-1)*D1D*D1D), e) += dxyz2;
3038 Y(dx + ((dy + (dz * D1D)) * D1D) + (2*(D1D-1)*D1D*D1D), e) += dxyz3;
3046 auto host_kernel = [&] MFEM_LAMBDA (
int)
3048 MFEM_ABORT_KERNEL(
"This kernel should only be used on GPU.");
3051 ForallWrap<3>(
true, NE, device_kernel, host_kernel, Q1D, Q1D, Q1D);
real_t u(const Vector &xvec)
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
void ForallWrap(const bool use_dev, const int N, d_lambda &&d_body, h_lambda &&h_body, const int X=0, const int Y=0, const int Z=0, const int G=0)
The forall kernel body wrapper.
void forall_3D(int N, int X, int Y, int Z, lambda &&body)
void forall(int N, lambda &&body)
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.