12#ifndef MFEM_BILININTEG_HCURL_KERNELS_HPP
13#define MFEM_BILININTEG_HCURL_KERNELS_HPP
32void PAHcurlMassAssembleDiagonal2D(
const int D1D,
36 const Array<real_t> &bo,
37 const Array<real_t> &bc,
38 const Vector &pa_data,
42void PAHcurlMassAssembleDiagonal3D(
const int D1D,
46 const Array<real_t> &bo,
47 const Array<real_t> &bc,
48 const Vector &pa_data,
52template<
int T_D1D = 0,
int T_Q1D = 0>
53inline void SmemPAHcurlMassAssembleDiagonal3D(
const int d1d,
57 const Array<real_t> &bo,
58 const Array<real_t> &bc,
59 const Vector &pa_data,
63 "Error: d1d > HCURL_MAX_D1D");
65 "Error: q1d > HCURL_MAX_Q1D");
66 const int D1D = T_D1D ? T_D1D : d1d;
67 const int Q1D = T_Q1D ? T_Q1D : q1d;
69 auto Bo =
Reshape(bo.Read(), Q1D, D1D-1);
70 auto Bc =
Reshape(bc.Read(), Q1D, D1D);
71 auto op =
Reshape(pa_data.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
72 auto D =
Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
76 constexpr int VDIM = 3;
77 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
78 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
79 const int D1D = T_D1D ? T_D1D : d1d;
80 const int Q1D = T_Q1D ? T_Q1D : q1d;
82 MFEM_SHARED
real_t sBo[MQ1D][MD1D];
83 MFEM_SHARED
real_t sBc[MQ1D][MD1D];
86 MFEM_SHARED
real_t sop[3][MQ1D][MQ1D];
88 MFEM_FOREACH_THREAD(qx,x,Q1D)
90 MFEM_FOREACH_THREAD(qy,y,Q1D)
92 MFEM_FOREACH_THREAD(qz,z,Q1D)
94 op3[0] = op(qx,qy,qz,0,e);
95 op3[1] = op(qx,qy,qz,symmetric ? 3 : 4,e);
96 op3[2] = op(qx,qy,qz,symmetric ? 5 : 8,e);
101 const int tidx = MFEM_THREAD_ID(x);
102 const int tidy = MFEM_THREAD_ID(y);
103 const int tidz = MFEM_THREAD_ID(z);
107 MFEM_FOREACH_THREAD(d,y,D1D)
109 MFEM_FOREACH_THREAD(q,x,Q1D)
122 for (
int c = 0; c < VDIM; ++c)
124 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
125 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
126 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
130 for (
int qz=0; qz < Q1D; ++qz)
134 for (
int i=0; i<3; ++i)
136 sop[i][tidx][tidy] = op3[i];
142 MFEM_FOREACH_THREAD(dz,z,D1Dz)
144 const real_t wz = ((c == 2) ? sBo[qz][dz] : sBc[qz][dz]);
146 MFEM_FOREACH_THREAD(dy,y,D1Dy)
148 MFEM_FOREACH_THREAD(dx,x,D1Dx)
150 for (
int qy = 0; qy < Q1D; ++qy)
152 const real_t wy = ((c == 1) ? sBo[qy][dy] : sBc[qy][dy]);
154 for (
int qx = 0; qx < Q1D; ++qx)
156 const real_t wx = ((c == 0) ? sBo[qx][dx] : sBc[qx][dx]);
157 dxyz += sop[c][qx][qy] * wx * wx * wy * wy * wz * wz;
167 MFEM_FOREACH_THREAD(dz,z,D1Dz)
169 MFEM_FOREACH_THREAD(dy,y,D1Dy)
171 MFEM_FOREACH_THREAD(dx,x,D1Dx)
173 D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += dxyz;
178 osc += D1Dx * D1Dy * D1Dz;
184void PAHcurlMassApply2D(
const int D1D,
187 const bool symmetric,
188 const Array<real_t> &bo,
189 const Array<real_t> &bc,
190 const Array<real_t> &bot,
191 const Array<real_t> &bct,
192 const Vector &pa_data,
197void PAHcurlMassApply3D(
const int D1D,
200 const bool symmetric,
201 const Array<real_t> &bo,
202 const Array<real_t> &bc,
203 const Array<real_t> &bot,
204 const Array<real_t> &bct,
205 const Vector &pa_data,
210template<
int T_D1D = 0,
int T_Q1D = 0>
211inline void SmemPAHcurlMassApply3D(
const int d1d,
214 const bool symmetric,
215 const Array<real_t> &bo,
216 const Array<real_t> &bc,
217 const Array<real_t> &bot,
218 const Array<real_t> &bct,
219 const Vector &pa_data,
224 "Error: d1d > HCURL_MAX_D1D");
226 "Error: q1d > HCURL_MAX_Q1D");
227 const int D1D = T_D1D ? T_D1D : d1d;
228 const int Q1D = T_Q1D ? T_Q1D : q1d;
230 const int dataSize = symmetric ? 6 : 9;
232 auto Bo =
Reshape(bo.Read(), Q1D, D1D-1);
233 auto Bc =
Reshape(bc.Read(), Q1D, D1D);
234 auto op =
Reshape(pa_data.Read(), Q1D, Q1D, Q1D, dataSize, NE);
235 auto X =
Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
236 auto Y =
Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
240 constexpr int VDIM = 3;
241 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
242 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
243 const int D1D = T_D1D ? T_D1D : d1d;
244 const int Q1D = T_Q1D ? T_Q1D : q1d;
246 MFEM_SHARED
real_t sBo[MQ1D][MD1D];
247 MFEM_SHARED
real_t sBc[MQ1D][MD1D];
250 MFEM_SHARED
real_t sop[9*MQ1D*MQ1D];
251 MFEM_SHARED
real_t mass[MQ1D][MQ1D][3];
253 MFEM_SHARED
real_t sX[MD1D][MD1D][MD1D];
255 MFEM_FOREACH_THREAD(qx,x,Q1D)
257 MFEM_FOREACH_THREAD(qy,y,Q1D)
259 MFEM_FOREACH_THREAD(qz,z,Q1D)
261 for (
int i=0; i<dataSize; ++i)
263 op9[i] = op(qx,qy,qz,i,e);
269 const int tidx = MFEM_THREAD_ID(x);
270 const int tidy = MFEM_THREAD_ID(y);
271 const int tidz = MFEM_THREAD_ID(z);
275 MFEM_FOREACH_THREAD(d,y,D1D)
277 MFEM_FOREACH_THREAD(q,x,Q1D)
289 for (
int qz=0; qz < Q1D; ++qz)
292 for (
int c = 0; c < VDIM; ++c)
294 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
295 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
296 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
298 MFEM_FOREACH_THREAD(dz,z,D1Dz)
300 MFEM_FOREACH_THREAD(dy,y,D1Dy)
302 MFEM_FOREACH_THREAD(dx,x,D1Dx)
304 sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
312 for (
int i=0; i<dataSize; ++i)
314 sop[i + (dataSize*tidx) + (dataSize*Q1D*tidy)] = op9[i];
317 MFEM_FOREACH_THREAD(qy,y,Q1D)
319 MFEM_FOREACH_THREAD(qx,x,Q1D)
323 for (
int dz = 0; dz < D1Dz; ++dz)
325 const real_t wz = (c == 2) ? sBo[qz][dz] : sBc[qz][dz];
326 for (
int dy = 0; dy < D1Dy; ++dy)
328 const real_t wy = (c == 1) ? sBo[qy][dy] : sBc[qy][dy];
329 for (
int dx = 0; dx < D1Dx; ++dx)
331 const real_t t = sX[dz][dy][dx];
332 const real_t wx = (c == 0) ? sBo[qx][dx] : sBc[qx][dx];
333 u += t * wx * wy * wz;
343 osc += D1Dx * D1Dy * D1Dz;
350 for (
int c = 0; c < VDIM; ++c)
352 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
353 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
354 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
358 MFEM_FOREACH_THREAD(dz,z,D1Dz)
360 const real_t wz = (c == 2) ? sBo[qz][dz] : sBc[qz][dz];
362 MFEM_FOREACH_THREAD(dy,y,D1Dy)
364 MFEM_FOREACH_THREAD(dx,x,D1Dx)
366 for (
int qy = 0; qy < Q1D; ++qy)
368 const real_t wy = (c == 1) ? sBo[qy][dy] : sBc[qy][dy];
369 for (
int qx = 0; qx < Q1D; ++qx)
371 const int os = (dataSize*qx) + (dataSize*Q1D*qy);
372 const int id1 = os + ((c == 0) ? 0 : ((c == 1) ? (symmetric ? 1 : 3) :
373 (symmetric ? 2 : 6)));
374 const int id2 = os + ((c == 0) ? 1 : ((c == 1) ? (symmetric ? 3 : 4) :
375 (symmetric ? 4 : 7)));
376 const int id3 = os + ((c == 0) ? 2 : ((c == 1) ? (symmetric ? 4 : 5) :
377 (symmetric ? 5 : 8)));
379 const real_t m_c = (sop[id1] * mass[qy][qx][0]) + (sop[id2] * mass[qy][qx][1]) +
380 (sop[id3] * mass[qy][qx][2]);
382 const real_t wx = (c == 0) ? sBo[qx][dx] : sBc[qx][dx];
383 dxyz += m_c * wx * wy * wz;
392 MFEM_FOREACH_THREAD(dz,z,D1Dz)
394 MFEM_FOREACH_THREAD(dy,y,D1Dy)
396 MFEM_FOREACH_THREAD(dx,x,D1Dx)
398 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += dxyz;
403 osc += D1Dx * D1Dy * D1Dz;
410void PACurlCurlSetup2D(
const int Q1D,
412 const Array<real_t> &w,
418void PACurlCurlSetup3D(
const int Q1D,
421 const Array<real_t> &w,
427void PACurlCurlAssembleDiagonal2D(
const int D1D,
429 const bool symmetric,
431 const Array<real_t> &bo,
432 const Array<real_t> &bc,
433 const Array<real_t> &go,
434 const Array<real_t> &gc,
435 const Vector &pa_data,
439template<
int T_D1D = 0,
int T_Q1D = 0>
440inline void PACurlCurlAssembleDiagonal3D(
const int d1d,
442 const bool symmetric,
444 const Array<real_t> &bo,
445 const Array<real_t> &bc,
446 const Array<real_t> &go,
447 const Array<real_t> &gc,
448 const Vector &pa_data,
452 "Error: d1d > HCURL_MAX_D1D");
454 "Error: q1d > HCURL_MAX_Q1D");
455 const int D1D = T_D1D ? T_D1D : d1d;
456 const int Q1D = T_Q1D ? T_Q1D : q1d;
458 auto Bo =
Reshape(bo.Read(), Q1D, D1D-1);
459 auto Bc =
Reshape(bc.Read(), Q1D, D1D);
460 auto Go =
Reshape(go.Read(), Q1D, D1D-1);
461 auto Gc =
Reshape(gc.Read(), Q1D, D1D);
462 auto op =
Reshape(pa_data.Read(), Q1D, Q1D, Q1D, (symmetric ? 6 : 9), NE);
463 auto D =
Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
465 const int s = symmetric ? 6 : 9;
469 const int i21 = symmetric ? i12 : 3;
470 const int i22 = symmetric ? 3 : 4;
471 const int i23 = symmetric ? 4 : 5;
472 const int i31 = symmetric ? i13 : 6;
473 const int i32 = symmetric ? i23 : 7;
474 const int i33 = symmetric ? 5 : 8;
487 constexpr int VDIM = 3;
488 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
489 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
490 const int D1D = T_D1D ? T_D1D : d1d;
491 const int Q1D = T_Q1D ? T_Q1D : q1d;
495 for (
int c = 0; c < VDIM; ++c)
497 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
498 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
499 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
501 real_t zt[MQ1D][MQ1D][MD1D][9][3];
504 for (
int qx = 0; qx < Q1D; ++qx)
506 for (
int qy = 0; qy < Q1D; ++qy)
508 for (
int dz = 0; dz < D1Dz; ++dz)
510 for (
int i=0; i<s; ++i)
512 for (
int d=0; d<3; ++d)
514 zt[qx][qy][dz][i][d] = 0.0;
518 for (
int qz = 0; qz < Q1D; ++qz)
520 const real_t wz = ((c == 2) ? Bo(qz,dz) : Bc(qz,dz));
521 const real_t wDz = ((c == 2) ? Go(qz,dz) : Gc(qz,dz));
523 for (
int i=0; i<s; ++i)
525 zt[qx][qy][dz][i][0] += wz * wz * op(qx,qy,qz,i,e);
526 zt[qx][qy][dz][i][1] += wDz * wz * op(qx,qy,qz,i,e);
527 zt[qx][qy][dz][i][2] += wDz * wDz * op(qx,qy,qz,i,e);
534 real_t yt[MQ1D][MD1D][MD1D][9][3][3];
537 for (
int qx = 0; qx < Q1D; ++qx)
539 for (
int dz = 0; dz < D1Dz; ++dz)
541 for (
int dy = 0; dy < D1Dy; ++dy)
543 for (
int i=0; i<s; ++i)
545 for (
int d=0; d<3; ++d)
546 for (
int j=0; j<3; ++j)
548 yt[qx][dy][dz][i][d][j] = 0.0;
552 for (
int qy = 0; qy < Q1D; ++qy)
554 const real_t wy = ((c == 1) ? Bo(qy,dy) : Bc(qy,dy));
555 const real_t wDy = ((c == 1) ? Go(qy,dy) : Gc(qy,dy));
557 for (
int i=0; i<s; ++i)
559 for (
int d=0; d<3; ++d)
561 yt[qx][dy][dz][i][d][0] += wy * wy * zt[qx][qy][dz][i][d];
562 yt[qx][dy][dz][i][d][1] += wDy * wy * zt[qx][qy][dz][i][d];
563 yt[qx][dy][dz][i][d][2] += wDy * wDy * zt[qx][qy][dz][i][d];
572 for (
int dz = 0; dz < D1Dz; ++dz)
574 for (
int dy = 0; dy < D1Dy; ++dy)
576 for (
int dx = 0; dx < D1Dx; ++dx)
578 for (
int qx = 0; qx < Q1D; ++qx)
580 const real_t wx = ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
581 const real_t wDx = ((c == 0) ? Go(qx,dx) : Gc(qx,dx));
601 const real_t sumy = yt[qx][dy][dz][i22][2][0] - yt[qx][dy][dz][i23][1][1]
602 - yt[qx][dy][dz][i32][1][1] + yt[qx][dy][dz][i33][0][2];
604 D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += sumy * wx * wx;
609 const real_t d = (yt[qx][dy][dz][i11][2][0] * wx * wx)
610 - ((yt[qx][dy][dz][i13][1][0] + yt[qx][dy][dz][i31][1][0]) * wDx * wx)
611 + (yt[qx][dy][dz][i33][0][0] * wDx * wDx);
613 D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += d;
618 const real_t d = (yt[qx][dy][dz][i11][0][2] * wx * wx)
619 - ((yt[qx][dy][dz][i12][0][1] + yt[qx][dy][dz][i21][0][1]) * wDx * wx)
620 + (yt[qx][dy][dz][i22][0][0] * wDx * wDx);
622 D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += d;
629 osc += D1Dx * D1Dy * D1Dz;
635template<
int T_D1D = 0,
int T_Q1D = 0>
636inline void SmemPACurlCurlAssembleDiagonal3D(
const int d1d,
638 const bool symmetric,
640 const Array<real_t> &bo,
641 const Array<real_t> &bc,
642 const Array<real_t> &go,
643 const Array<real_t> &gc,
644 const Vector &pa_data,
648 "Error: d1d > HCURL_MAX_D1D");
650 "Error: q1d > HCURL_MAX_Q1D");
651 const int D1D = T_D1D ? T_D1D : d1d;
652 const int Q1D = T_Q1D ? T_Q1D : q1d;
654 auto Bo =
Reshape(bo.Read(), Q1D, D1D-1);
655 auto Bc =
Reshape(bc.Read(), Q1D, D1D);
656 auto Go =
Reshape(go.Read(), Q1D, D1D-1);
657 auto Gc =
Reshape(gc.Read(), Q1D, D1D);
658 auto op =
Reshape(pa_data.Read(), Q1D, Q1D, Q1D, (symmetric ? 6 : 9), NE);
659 auto D =
Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
661 const int s = symmetric ? 6 : 9;
665 const int i21 = symmetric ? i12 : 3;
666 const int i22 = symmetric ? 3 : 4;
667 const int i23 = symmetric ? 4 : 5;
668 const int i31 = symmetric ? i13 : 6;
669 const int i32 = symmetric ? i23 : 7;
670 const int i33 = symmetric ? 5 : 8;
680 constexpr int VDIM = 3;
681 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
682 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
683 const int D1D = T_D1D ? T_D1D : d1d;
684 const int Q1D = T_Q1D ? T_Q1D : q1d;
686 MFEM_SHARED
real_t sBo[MQ1D][MD1D];
687 MFEM_SHARED
real_t sBc[MQ1D][MD1D];
688 MFEM_SHARED
real_t sGo[MQ1D][MD1D];
689 MFEM_SHARED
real_t sGc[MQ1D][MD1D];
692 MFEM_SHARED
real_t sop[9][MQ1D][MQ1D];
694 MFEM_FOREACH_THREAD(qx,x,Q1D)
696 MFEM_FOREACH_THREAD(qy,y,Q1D)
698 MFEM_FOREACH_THREAD(qz,z,Q1D)
700 for (
int i=0; i<s; ++i)
702 ope[i] = op(qx,qy,qz,i,e);
708 const int tidx = MFEM_THREAD_ID(x);
709 const int tidy = MFEM_THREAD_ID(y);
710 const int tidz = MFEM_THREAD_ID(z);
714 MFEM_FOREACH_THREAD(d,y,D1D)
716 MFEM_FOREACH_THREAD(q,x,Q1D)
731 for (
int c = 0; c < VDIM; ++c)
733 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
734 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
735 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
739 for (
int qz=0; qz < Q1D; ++qz)
743 for (
int i=0; i<s; ++i)
745 sop[i][tidx][tidy] = ope[i];
751 MFEM_FOREACH_THREAD(dz,z,D1Dz)
753 const real_t wz = ((c == 2) ? sBo[qz][dz] : sBc[qz][dz]);
754 const real_t wDz = ((c == 2) ? sGo[qz][dz] : sGc[qz][dz]);
756 MFEM_FOREACH_THREAD(dy,y,D1Dy)
758 MFEM_FOREACH_THREAD(dx,x,D1Dx)
760 for (
int qy = 0; qy < Q1D; ++qy)
762 const real_t wy = ((c == 1) ? sBo[qy][dy] : sBc[qy][dy]);
763 const real_t wDy = ((c == 1) ? sGo[qy][dy] : sGc[qy][dy]);
765 for (
int qx = 0; qx < Q1D; ++qx)
767 const real_t wx = ((c == 0) ? sBo[qx][dx] : sBc[qx][dx]);
768 const real_t wDx = ((c == 0) ? sGo[qx][dx] : sGc[qx][dx]);
775 dxyz += sop[i22][qx][qy] * wx * wx * wy * wy * wDz * wDz;
778 dxyz += -(sop[i23][qx][qy] + sop[i32][qx][qy]) * wx * wx * wDy * wy * wDz * wz;
781 dxyz += sop[i33][qx][qy] * wx * wx * wDy * wDy * wz * wz;
788 dxyz += sop[i11][qx][qy] * wx * wx * wy * wy * wDz * wDz;
791 dxyz += -(sop[i13][qx][qy] + sop[i31][qx][qy]) * wDx * wx * wy * wy * wDz * wz;
794 dxyz += sop[i33][qx][qy] * wDx * wDx * wy * wy * wz * wz;
801 dxyz += sop[i11][qx][qy] * wx * wx * wDy * wDy * wz * wz;
804 dxyz += -(sop[i12][qx][qy] + sop[i21][qx][qy]) * wDx * wx * wDy * wy * wz * wz;
807 dxyz += sop[i22][qx][qy] * wDx * wDx * wy * wy * wz * wz;
818 MFEM_FOREACH_THREAD(dz,z,D1Dz)
820 MFEM_FOREACH_THREAD(dy,y,D1Dy)
822 MFEM_FOREACH_THREAD(dx,x,D1Dx)
824 D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += dxyz;
829 osc += D1Dx * D1Dy * D1Dz;
835void PACurlCurlApply2D(
const int D1D,
837 const bool symmetric,
839 const Array<real_t> &bo,
840 const Array<real_t> &bc,
841 const Array<real_t> &bot,
842 const Array<real_t> &bct,
843 const Array<real_t> &gc,
844 const Array<real_t> &gct,
845 const Vector &pa_data,
848 const bool useAbs =
false);
851template<
int T_D1D = 0,
int T_Q1D = 0>
852inline void PACurlCurlApply3D(
const int d1d,
854 const bool symmetric,
856 const Array<real_t> &bo,
857 const Array<real_t> &bc,
858 const Array<real_t> &bot,
859 const Array<real_t> &bct,
860 const Array<real_t> &gc,
861 const Array<real_t> &gct,
862 const Vector &pa_data,
865 const bool useAbs =
false)
868 "Error: d1d > HCURL_MAX_D1D");
870 "Error: q1d > HCURL_MAX_Q1D");
871 const int D1D = T_D1D ? T_D1D : d1d;
872 const int Q1D = T_Q1D ? T_Q1D : q1d;
874 auto Bo =
Reshape(bo.Read(), Q1D, D1D-1);
875 auto Bc =
Reshape(bc.Read(), Q1D, D1D);
876 auto Bot =
Reshape(bot.Read(), D1D-1, Q1D);
877 auto Bct =
Reshape(bct.Read(), D1D, Q1D);
878 auto Gc =
Reshape(gc.Read(), Q1D, D1D);
879 auto Gct =
Reshape(gct.Read(), D1D, Q1D);
880 auto op =
Reshape(pa_data.Read(), Q1D, Q1D, Q1D, (symmetric ? 6 : 9), NE);
881 auto X =
Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
882 auto Y =
Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
894 constexpr int VDIM = 3;
895 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
896 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
897 const int D1D = T_D1D ? T_D1D : d1d;
898 const int Q1D = T_Q1D ? T_Q1D : q1d;
900 real_t curl[MQ1D][MQ1D][MQ1D][VDIM];
903 for (
int qz = 0; qz < Q1D; ++qz)
905 for (
int qy = 0; qy < Q1D; ++qy)
907 for (
int qx = 0; qx < Q1D; ++qx)
909 for (
int c = 0; c < VDIM; ++c)
911 curl[qz][qy][qx][c] = 0.0;
923 const int D1Dz = D1D;
924 const int D1Dy = D1D;
925 const int D1Dx = D1D - 1;
927 for (
int dz = 0; dz < D1Dz; ++dz)
929 real_t gradXY[MQ1D][MQ1D][2];
930 for (
int qy = 0; qy < Q1D; ++qy)
932 for (
int qx = 0; qx < Q1D; ++qx)
934 for (
int d = 0; d < 2; ++d)
936 gradXY[qy][qx][d] = 0.0;
941 for (
int dy = 0; dy < D1Dy; ++dy)
944 for (
int qx = 0; qx < Q1D; ++qx)
949 for (
int dx = 0; dx < D1Dx; ++dx)
951 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
952 for (
int qx = 0; qx < Q1D; ++qx)
954 massX[qx] += t * Bo(qx,dx);
958 for (
int qy = 0; qy < Q1D; ++qy)
960 const real_t wy = Bc(qy,dy);
961 const real_t wDy = Gc(qy,dy);
962 for (
int qx = 0; qx < Q1D; ++qx)
964 const real_t wx = massX[qx];
965 gradXY[qy][qx][0] += wx * wDy;
966 gradXY[qy][qx][1] += wx * wy;
971 for (
int qz = 0; qz < Q1D; ++qz)
973 const real_t wz = Bc(qz,dz);
974 const real_t wDz = Gc(qz,dz);
975 for (
int qy = 0; qy < Q1D; ++qy)
977 for (
int qx = 0; qx < Q1D; ++qx)
980 curl[qz][qy][qx][1] += gradXY[qy][qx][1] * wDz;
984 curl[qz][qy][qx][2] += gradXY[qy][qx][0] * wz;
989 curl[qz][qy][qx][2] -= gradXY[qy][qx][0] * wz;
996 osc += D1Dx * D1Dy * D1Dz;
1001 const int D1Dz = D1D;
1002 const int D1Dy = D1D - 1;
1003 const int D1Dx = D1D;
1005 for (
int dz = 0; dz < D1Dz; ++dz)
1007 real_t gradXY[MQ1D][MQ1D][2];
1008 for (
int qy = 0; qy < Q1D; ++qy)
1010 for (
int qx = 0; qx < Q1D; ++qx)
1012 for (
int d = 0; d < 2; ++d)
1014 gradXY[qy][qx][d] = 0.0;
1019 for (
int dx = 0; dx < D1Dx; ++dx)
1022 for (
int qy = 0; qy < Q1D; ++qy)
1027 for (
int dy = 0; dy < D1Dy; ++dy)
1029 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
1030 for (
int qy = 0; qy < Q1D; ++qy)
1032 massY[qy] += t * Bo(qy,dy);
1036 for (
int qx = 0; qx < Q1D; ++qx)
1038 const real_t wx = Bc(qx,dx);
1039 const real_t wDx = Gc(qx,dx);
1040 for (
int qy = 0; qy < Q1D; ++qy)
1042 const real_t wy = massY[qy];
1043 gradXY[qy][qx][0] += wDx * wy;
1044 gradXY[qy][qx][1] += wx * wy;
1049 for (
int qz = 0; qz < Q1D; ++qz)
1051 const real_t wz = Bc(qz,dz);
1052 const real_t wDz = Gc(qz,dz);
1053 for (
int qy = 0; qy < Q1D; ++qy)
1055 for (
int qx = 0; qx < Q1D; ++qx)
1061 curl[qz][qy][qx][0] += gradXY[qy][qx][1] * wDz;
1066 curl[qz][qy][qx][0] -= gradXY[qy][qx][1] * wDz;
1068 curl[qz][qy][qx][2] += gradXY[qy][qx][0] * wz;
1074 osc += D1Dx * D1Dy * D1Dz;
1079 const int D1Dz = D1D - 1;
1080 const int D1Dy = D1D;
1081 const int D1Dx = D1D;
1083 for (
int dx = 0; dx < D1Dx; ++dx)
1085 real_t gradYZ[MQ1D][MQ1D][2];
1086 for (
int qz = 0; qz < Q1D; ++qz)
1088 for (
int qy = 0; qy < Q1D; ++qy)
1090 for (
int d = 0; d < 2; ++d)
1092 gradYZ[qz][qy][d] = 0.0;
1097 for (
int dy = 0; dy < D1Dy; ++dy)
1100 for (
int qz = 0; qz < Q1D; ++qz)
1105 for (
int dz = 0; dz < D1Dz; ++dz)
1107 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
1108 for (
int qz = 0; qz < Q1D; ++qz)
1110 massZ[qz] += t * Bo(qz,dz);
1114 for (
int qy = 0; qy < Q1D; ++qy)
1116 const real_t wy = Bc(qy,dy);
1117 const real_t wDy = Gc(qy,dy);
1118 for (
int qz = 0; qz < Q1D; ++qz)
1120 const real_t wz = massZ[qz];
1121 gradYZ[qz][qy][0] += wz * wy;
1122 gradYZ[qz][qy][1] += wz * wDy;
1127 for (
int qx = 0; qx < Q1D; ++qx)
1129 const real_t wx = Bc(qx,dx);
1130 const real_t wDx = Gc(qx,dx);
1132 for (
int qy = 0; qy < Q1D; ++qy)
1134 for (
int qz = 0; qz < Q1D; ++qz)
1137 curl[qz][qy][qx][0] += gradYZ[qz][qy][1] * wx;
1141 curl[qz][qy][qx][1] += gradYZ[qz][qy][0] * wDx;
1146 curl[qz][qy][qx][1] -= gradYZ[qz][qy][0] * wDx;
1155 for (
int qz = 0; qz < Q1D; ++qz)
1157 for (
int qy = 0; qy < Q1D; ++qy)
1159 for (
int qx = 0; qx < Q1D; ++qx)
1161 const real_t O11 = op(qx,qy,qz,0,e);
1162 const real_t O12 = op(qx,qy,qz,1,e);
1163 const real_t O13 = op(qx,qy,qz,2,e);
1164 const real_t O21 = symmetric ? O12 : op(qx,qy,qz,3,e);
1165 const real_t O22 = symmetric ? op(qx,qy,qz,3,e) : op(qx,qy,qz,4,e);
1166 const real_t O23 = symmetric ? op(qx,qy,qz,4,e) : op(qx,qy,qz,5,e);
1167 const real_t O31 = symmetric ? O13 : op(qx,qy,qz,6,e);
1168 const real_t O32 = symmetric ? O23 : op(qx,qy,qz,7,e);
1169 const real_t O33 = symmetric ? op(qx,qy,qz,5,e) : op(qx,qy,qz,8,e);
1171 const real_t c1 = (O11 * curl[qz][qy][qx][0]) + (O12 * curl[qz][qy][qx][1]) +
1172 (O13 * curl[qz][qy][qx][2]);
1173 const real_t c2 = (O21 * curl[qz][qy][qx][0]) + (O22 * curl[qz][qy][qx][1]) +
1174 (O23 * curl[qz][qy][qx][2]);
1175 const real_t c3 = (O31 * curl[qz][qy][qx][0]) + (O32 * curl[qz][qy][qx][1]) +
1176 (O33 * curl[qz][qy][qx][2]);
1178 curl[qz][qy][qx][0] = c1;
1179 curl[qz][qy][qx][1] = c2;
1180 curl[qz][qy][qx][2] = c3;
1188 const int D1Dz = D1D;
1189 const int D1Dy = D1D;
1190 const int D1Dx = D1D - 1;
1192 for (
int qz = 0; qz < Q1D; ++qz)
1194 real_t gradXY12[MD1D][MD1D];
1195 real_t gradXY21[MD1D][MD1D];
1197 for (
int dy = 0; dy < D1Dy; ++dy)
1199 for (
int dx = 0; dx < D1Dx; ++dx)
1201 gradXY12[dy][dx] = 0.0;
1202 gradXY21[dy][dx] = 0.0;
1205 for (
int qy = 0; qy < Q1D; ++qy)
1208 for (
int dx = 0; dx < D1Dx; ++dx)
1210 for (
int n = 0; n < 2; ++n)
1215 for (
int qx = 0; qx < Q1D; ++qx)
1217 for (
int dx = 0; dx < D1Dx; ++dx)
1219 const real_t wx = Bot(dx,qx);
1221 massX[dx][0] += wx * curl[qz][qy][qx][1];
1222 massX[dx][1] += wx * curl[qz][qy][qx][2];
1225 for (
int dy = 0; dy < D1Dy; ++dy)
1227 const real_t wy = Bct(dy,qy);
1228 const real_t wDy = Gct(dy,qy);
1230 for (
int dx = 0; dx < D1Dx; ++dx)
1232 gradXY21[dy][dx] += massX[dx][0] * wy;
1233 gradXY12[dy][dx] += massX[dx][1] * wDy;
1238 for (
int dz = 0; dz < D1Dz; ++dz)
1240 const real_t wz = Bct(dz,qz);
1241 const real_t wDz = Gct(dz,qz);
1242 for (
int dy = 0; dy < D1Dy; ++dy)
1244 for (
int dx = 0; dx < D1Dx; ++dx)
1247 const int idx = dx + ((dy + (dz * D1Dy)) * D1Dx) + osc;
1252 Y(idx, e) += (gradXY21[dy][dx] * wDz) +
1253 (gradXY12[dy][dx] * wz);
1259 Y(idx, e) += (gradXY21[dy][dx] * wDz) -
1260 (gradXY12[dy][dx] * wz);
1267 osc += D1Dx * D1Dy * D1Dz;
1272 const int D1Dz = D1D;
1273 const int D1Dy = D1D - 1;
1274 const int D1Dx = D1D;
1276 for (
int qz = 0; qz < Q1D; ++qz)
1278 real_t gradXY02[MD1D][MD1D];
1279 real_t gradXY20[MD1D][MD1D];
1281 for (
int dy = 0; dy < D1Dy; ++dy)
1283 for (
int dx = 0; dx < D1Dx; ++dx)
1285 gradXY02[dy][dx] = 0.0;
1286 gradXY20[dy][dx] = 0.0;
1289 for (
int qx = 0; qx < Q1D; ++qx)
1292 for (
int dy = 0; dy < D1Dy; ++dy)
1297 for (
int qy = 0; qy < Q1D; ++qy)
1299 for (
int dy = 0; dy < D1Dy; ++dy)
1301 const real_t wy = Bot(dy,qy);
1303 massY[dy][0] += wy * curl[qz][qy][qx][2];
1304 massY[dy][1] += wy * curl[qz][qy][qx][0];
1307 for (
int dx = 0; dx < D1Dx; ++dx)
1309 const real_t wx = Bct(dx,qx);
1310 const real_t wDx = Gct(dx,qx);
1312 for (
int dy = 0; dy < D1Dy; ++dy)
1314 gradXY02[dy][dx] += massY[dy][0] * wDx;
1315 gradXY20[dy][dx] += massY[dy][1] * wx;
1320 for (
int dz = 0; dz < D1Dz; ++dz)
1322 const real_t wz = Bct(dz,qz);
1323 const real_t wDz = Gct(dz,qz);
1324 for (
int dy = 0; dy < D1Dy; ++dy)
1326 for (
int dx = 0; dx < D1Dx; ++dx)
1328 const int idx = dx + ((dy + (dz * D1Dy)) * D1Dx) + osc;
1334 Y(idx, e) += (gradXY20[dy][dx] * wDz) +
1335 (gradXY02[dy][dx] * wz);
1341 Y(idx, e) += (-gradXY20[dy][dx] * wDz) +
1342 (gradXY02[dy][dx] * wz);
1349 osc += D1Dx * D1Dy * D1Dz;
1354 const int D1Dz = D1D - 1;
1355 const int D1Dy = D1D;
1356 const int D1Dx = D1D;
1358 for (
int qx = 0; qx < Q1D; ++qx)
1360 real_t gradYZ01[MD1D][MD1D];
1361 real_t gradYZ10[MD1D][MD1D];
1363 for (
int dy = 0; dy < D1Dy; ++dy)
1365 for (
int dz = 0; dz < D1Dz; ++dz)
1367 gradYZ01[dz][dy] = 0.0;
1368 gradYZ10[dz][dy] = 0.0;
1371 for (
int qy = 0; qy < Q1D; ++qy)
1374 for (
int dz = 0; dz < D1Dz; ++dz)
1376 for (
int n = 0; n < 2; ++n)
1381 for (
int qz = 0; qz < Q1D; ++qz)
1383 for (
int dz = 0; dz < D1Dz; ++dz)
1385 const real_t wz = Bot(dz,qz);
1387 massZ[dz][0] += wz * curl[qz][qy][qx][0];
1388 massZ[dz][1] += wz * curl[qz][qy][qx][1];
1391 for (
int dy = 0; dy < D1Dy; ++dy)
1393 const real_t wy = Bct(dy,qy);
1394 const real_t wDy = Gct(dy,qy);
1396 for (
int dz = 0; dz < D1Dz; ++dz)
1398 gradYZ01[dz][dy] += wy * massZ[dz][1];
1399 gradYZ10[dz][dy] += wDy * massZ[dz][0];
1404 for (
int dx = 0; dx < D1Dx; ++dx)
1406 const real_t wx = Bct(dx,qx);
1407 const real_t wDx = Gct(dx,qx);
1409 for (
int dy = 0; dy < D1Dy; ++dy)
1411 for (
int dz = 0; dz < D1Dz; ++dz)
1413 const int idx = dx + ((dy + (dz * D1Dy)) * D1Dx) + osc;
1419 Y(idx, e) += (gradYZ10[dz][dy] * wx) +
1420 (gradYZ01[dz][dy] * wDx);
1426 Y(idx, e) += (gradYZ10[dz][dy] * wx) -
1427 (gradYZ01[dz][dy] * wDx);
1438template<
int T_D1D = 0,
int T_Q1D = 0>
1439inline void SmemPACurlCurlApply3D(
const int d1d,
1441 const bool symmetric,
1443 const Array<real_t> &bo,
1444 const Array<real_t> &bc,
1445 const Array<real_t> &bot,
1446 const Array<real_t> &bct,
1447 const Array<real_t> &gc,
1448 const Array<real_t> &gct,
1449 const Vector &pa_data,
1452 const bool useAbs =
false)
1455 "Error: d1d > HCURL_MAX_D1D");
1457 "Error: q1d > HCURL_MAX_Q1D");
1458 const int D1D = T_D1D ? T_D1D : d1d;
1459 const int Q1D = T_Q1D ? T_Q1D : q1d;
1467 auto Bo =
Reshape(bo.Read(), Q1D, D1D-1);
1468 auto Bc =
Reshape(bc.Read(), Q1D, D1D);
1469 auto Gc =
Reshape(gc.Read(), Q1D, D1D);
1470 auto op =
Reshape(pa_data.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
1471 auto X =
Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
1472 auto Y =
Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
1474 const int s = symmetric ? 6 : 9;
1476 auto device_kernel = [=] MFEM_DEVICE (
int e)
1478 constexpr int VDIM = 3;
1479 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
1480 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
1481 const int D1D = T_D1D ? T_D1D : d1d;
1482 const int Q1D = T_Q1D ? T_Q1D : q1d;
1484 MFEM_SHARED
real_t sBo[MD1D][MQ1D];
1485 MFEM_SHARED
real_t sBc[MD1D][MQ1D];
1486 MFEM_SHARED
real_t sGc[MD1D][MQ1D];
1489 MFEM_SHARED
real_t sop[9][MQ1D][MQ1D];
1490 MFEM_SHARED
real_t curl[MQ1D][MQ1D][3];
1492 MFEM_SHARED
real_t sX[MD1D][MD1D][MD1D];
1494 MFEM_FOREACH_THREAD(qx,x,Q1D)
1496 MFEM_FOREACH_THREAD(qy,y,Q1D)
1498 MFEM_FOREACH_THREAD(qz,z,Q1D)
1500 for (
int i=0; i<s; ++i)
1502 ope[i] = op(qx,qy,qz,i,e);
1508 const int tidx = MFEM_THREAD_ID(x);
1509 const int tidy = MFEM_THREAD_ID(y);
1510 const int tidz = MFEM_THREAD_ID(z);
1514 MFEM_FOREACH_THREAD(d,y,D1D)
1516 MFEM_FOREACH_THREAD(q,x,Q1D)
1518 sBc[d][q] = Bc(q,d);
1519 sGc[d][q] = Gc(q,d);
1522 sBo[d][q] = Bo(q,d);
1529 for (
int qz=0; qz < Q1D; ++qz)
1533 MFEM_FOREACH_THREAD(qy,y,Q1D)
1535 MFEM_FOREACH_THREAD(qx,x,Q1D)
1537 for (
int i=0; i<3; ++i)
1539 curl[qy][qx][i] = 0.0;
1546 for (
int c = 0; c < VDIM; ++c)
1548 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
1549 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
1550 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
1552 MFEM_FOREACH_THREAD(dz,z,D1Dz)
1554 MFEM_FOREACH_THREAD(dy,y,D1Dy)
1556 MFEM_FOREACH_THREAD(dx,x,D1Dx)
1558 sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
1568 for (
int i=0; i<s; ++i)
1570 sop[i][tidx][tidy] = ope[i];
1574 MFEM_FOREACH_THREAD(qy,y,Q1D)
1576 MFEM_FOREACH_THREAD(qx,x,Q1D)
1586 for (
int dz = 0; dz < D1Dz; ++dz)
1588 const real_t wz = sBc[dz][qz];
1589 const real_t wDz = sGc[dz][qz];
1591 for (
int dy = 0; dy < D1Dy; ++dy)
1593 const real_t wy = sBc[dy][qy];
1594 const real_t wDy = sGc[dy][qy];
1596 for (
int dx = 0; dx < D1Dx; ++dx)
1598 const real_t wx = sX[dz][dy][dx] * sBo[dx][qx];
1605 curl[qy][qx][1] += v;
1606 if (useAbs) { curl[qy][qx][2] +=
u; }
1607 else { curl[qy][qx][2] -=
u; }
1613 for (
int dz = 0; dz < D1Dz; ++dz)
1615 const real_t wz = sBc[dz][qz];
1616 const real_t wDz = sGc[dz][qz];
1618 for (
int dy = 0; dy < D1Dy; ++dy)
1620 const real_t wy = sBo[dy][qy];
1622 for (
int dx = 0; dx < D1Dx; ++dx)
1624 const real_t t = sX[dz][dy][dx];
1625 const real_t wx = t * sBc[dx][qx];
1626 const real_t wDx = t * sGc[dx][qx];
1634 if (useAbs) { curl[qy][qx][0] += v; }
1635 else { curl[qy][qx][0] -= v; }
1636 curl[qy][qx][2] +=
u;
1642 for (
int dz = 0; dz < D1Dz; ++dz)
1644 const real_t wz = sBo[dz][qz];
1646 for (
int dy = 0; dy < D1Dy; ++dy)
1648 const real_t wy = sBc[dy][qy];
1649 const real_t wDy = sGc[dy][qy];
1651 for (
int dx = 0; dx < D1Dx; ++dx)
1653 const real_t t = sX[dz][dy][dx];
1654 const real_t wx = t * sBc[dx][qx];
1655 const real_t wDx = t * sGc[dx][qx];
1663 curl[qy][qx][0] += v;
1664 if (useAbs) { curl[qy][qx][1] +=
u; }
1665 else { curl[qy][qx][1] -=
u; }
1671 osc += D1Dx * D1Dy * D1Dz;
1679 MFEM_FOREACH_THREAD(dz,z,D1D)
1681 const real_t wcz = sBc[dz][qz];
1682 const real_t wcDz = sGc[dz][qz];
1683 const real_t wz = (dz < D1D-1) ? sBo[dz][qz] : 0.0;
1685 MFEM_FOREACH_THREAD(dy,y,D1D)
1687 MFEM_FOREACH_THREAD(dx,x,D1D)
1689 for (
int qy = 0; qy < Q1D; ++qy)
1691 const real_t wcy = sBc[dy][qy];
1692 const real_t wcDy = sGc[dy][qy];
1693 const real_t wy = (dy < D1D-1) ? sBo[dy][qy] : 0.0;
1695 for (
int qx = 0; qx < Q1D; ++qx)
1697 const real_t O11 = sop[0][qx][qy];
1698 const real_t O12 = sop[1][qx][qy];
1699 const real_t O13 = sop[2][qx][qy];
1700 const real_t O21 = symmetric ? O12 : sop[3][qx][qy];
1701 const real_t O22 = symmetric ? sop[3][qx][qy] : sop[4][qx][qy];
1702 const real_t O23 = symmetric ? sop[4][qx][qy] : sop[5][qx][qy];
1703 const real_t O31 = symmetric ? O13 : sop[6][qx][qy];
1704 const real_t O32 = symmetric ? O23 : sop[7][qx][qy];
1705 const real_t O33 = symmetric ? sop[5][qx][qy] : sop[8][qx][qy];
1707 const real_t c1 = (O11 * curl[qy][qx][0]) + (O12 * curl[qy][qx][1]) +
1708 (O13 * curl[qy][qx][2]);
1709 const real_t c2 = (O21 * curl[qy][qx][0]) + (O22 * curl[qy][qx][1]) +
1710 (O23 * curl[qy][qx][2]);
1711 const real_t c3 = (O31 * curl[qy][qx][0]) + (O32 * curl[qy][qx][1]) +
1712 (O33 * curl[qy][qx][2]);
1714 const real_t wcx = sBc[dx][qx];
1715 const real_t wDx = sGc[dx][qx];
1720 const real_t wx = sBo[dx][qx];
1725 dxyz1 += (wx * c2 * wcy * wcDz) +
1726 (wx * c3 * wcDy * wcz);
1732 dxyz1 += (wx * c2 * wcy * wcDz) -
1733 (wx * c3 * wcDy * wcz);
1742 dxyz2 += (wy * c1 * wcx * wcDz) +
1743 (wy * c3 * wDx * wcz);
1749 dxyz2 += (-wy * c1 * wcx * wcDz) +
1750 (wy * c3 * wDx * wcz);
1758 dxyz3 += (wcDy * wz * c1 * wcx) +
1759 (wcy * wz * c2 * wDx);
1765 dxyz3 += (wcDy * wz * c1 * wcx) -
1766 (wcy * wz * c2 * wDx);
1776 MFEM_FOREACH_THREAD(dz,z,D1D)
1778 MFEM_FOREACH_THREAD(dy,y,D1D)
1780 MFEM_FOREACH_THREAD(dx,x,D1D)
1784 Y(dx + ((dy + (dz * D1D)) * (D1D-1)), e) += dxyz1;
1788 Y(dx + ((dy + (dz * (D1D-1))) * D1D) + ((D1D-1)*D1D*D1D), e) += dxyz2;
1792 Y(dx + ((dy + (dz * D1D)) * D1D) + (2*(D1D-1)*D1D*D1D), e) += dxyz3;
1800 auto host_kernel = [&] MFEM_LAMBDA (
int)
1802 MFEM_ABORT_KERNEL(
"This kernel should only be used on GPU.");
1805 ForallWrap<3>(
true, NE, device_kernel, host_kernel, Q1D, Q1D, Q1D);
1809void PAHcurlL2Setup2D(
const int Q1D,
1811 const Array<real_t> &w,
1816void PAHcurlL2Setup3D(
const int NQ,
1819 const Array<real_t> &w,
1824void PAHcurlL2Apply2D(
const int D1D,
1828 const Array<real_t> &bo,
1829 const Array<real_t> &bot,
1830 const Array<real_t> &bt,
1831 const Array<real_t> &gc,
1832 const Vector &pa_data,
1837void PAHcurlL2ApplyTranspose2D(
const int D1D,
1841 const Array<real_t> &bo,
1842 const Array<real_t> &bot,
1843 const Array<real_t> &
b,
1844 const Array<real_t> &gct,
1845 const Vector &pa_data,
1850template<
int T_D1D = 0,
int T_Q1D = 0>
1851inline void PAHcurlL2Apply3D(
const int d1d,
1855 const Array<real_t> &bo,
1856 const Array<real_t> &bc,
1857 const Array<real_t> &bot,
1858 const Array<real_t> &bct,
1859 const Array<real_t> &gc,
1860 const Vector &pa_data,
1865 "Error: d1d > HCURL_MAX_D1D");
1867 "Error: q1d > HCURL_MAX_Q1D");
1868 const int D1D = T_D1D ? T_D1D : d1d;
1869 const int Q1D = T_Q1D ? T_Q1D : q1d;
1871 auto Bo =
Reshape(bo.Read(), Q1D, D1D-1);
1872 auto Bc =
Reshape(bc.Read(), Q1D, D1D);
1873 auto Bot =
Reshape(bot.Read(), D1D-1, Q1D);
1874 auto Bct =
Reshape(bct.Read(), D1D, Q1D);
1875 auto Gc =
Reshape(gc.Read(), Q1D, D1D);
1876 auto op =
Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
1877 auto X =
Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
1878 auto Y =
Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
1891 constexpr int VDIM = 3;
1892 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
1893 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
1894 const int D1D = T_D1D ? T_D1D : d1d;
1895 const int Q1D = T_Q1D ? T_Q1D : q1d;
1897 real_t curl[MQ1D][MQ1D][MQ1D][VDIM];
1900 for (
int qz = 0; qz < Q1D; ++qz)
1902 for (
int qy = 0; qy < Q1D; ++qy)
1904 for (
int qx = 0; qx < Q1D; ++qx)
1906 for (
int c = 0; c < VDIM; ++c)
1908 curl[qz][qy][qx][c] = 0.0;
1920 const int D1Dz = D1D;
1921 const int D1Dy = D1D;
1922 const int D1Dx = D1D - 1;
1924 for (
int dz = 0; dz < D1Dz; ++dz)
1926 real_t gradXY[MQ1D][MQ1D][2];
1927 for (
int qy = 0; qy < Q1D; ++qy)
1929 for (
int qx = 0; qx < Q1D; ++qx)
1931 for (
int d = 0; d < 2; ++d)
1933 gradXY[qy][qx][d] = 0.0;
1938 for (
int dy = 0; dy < D1Dy; ++dy)
1941 for (
int qx = 0; qx < Q1D; ++qx)
1946 for (
int dx = 0; dx < D1Dx; ++dx)
1948 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
1949 for (
int qx = 0; qx < Q1D; ++qx)
1951 massX[qx] += t * Bo(qx,dx);
1955 for (
int qy = 0; qy < Q1D; ++qy)
1957 const real_t wy = Bc(qy,dy);
1958 const real_t wDy = Gc(qy,dy);
1959 for (
int qx = 0; qx < Q1D; ++qx)
1961 const real_t wx = massX[qx];
1962 gradXY[qy][qx][0] += wx * wDy;
1963 gradXY[qy][qx][1] += wx * wy;
1968 for (
int qz = 0; qz < Q1D; ++qz)
1970 const real_t wz = Bc(qz,dz);
1971 const real_t wDz = Gc(qz,dz);
1972 for (
int qy = 0; qy < Q1D; ++qy)
1974 for (
int qx = 0; qx < Q1D; ++qx)
1977 curl[qz][qy][qx][1] += gradXY[qy][qx][1] * wDz;
1978 curl[qz][qy][qx][2] -= gradXY[qy][qx][0] * wz;
1984 osc += D1Dx * D1Dy * D1Dz;
1989 const int D1Dz = D1D;
1990 const int D1Dy = D1D - 1;
1991 const int D1Dx = D1D;
1993 for (
int dz = 0; dz < D1Dz; ++dz)
1995 real_t gradXY[MQ1D][MQ1D][2];
1996 for (
int qy = 0; qy < Q1D; ++qy)
1998 for (
int qx = 0; qx < Q1D; ++qx)
2000 for (
int d = 0; d < 2; ++d)
2002 gradXY[qy][qx][d] = 0.0;
2007 for (
int dx = 0; dx < D1Dx; ++dx)
2010 for (
int qy = 0; qy < Q1D; ++qy)
2015 for (
int dy = 0; dy < D1Dy; ++dy)
2017 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
2018 for (
int qy = 0; qy < Q1D; ++qy)
2020 massY[qy] += t * Bo(qy,dy);
2024 for (
int qx = 0; qx < Q1D; ++qx)
2026 const real_t wx = Bc(qx,dx);
2027 const real_t wDx = Gc(qx,dx);
2028 for (
int qy = 0; qy < Q1D; ++qy)
2030 const real_t wy = massY[qy];
2031 gradXY[qy][qx][0] += wDx * wy;
2032 gradXY[qy][qx][1] += wx * wy;
2037 for (
int qz = 0; qz < Q1D; ++qz)
2039 const real_t wz = Bc(qz,dz);
2040 const real_t wDz = Gc(qz,dz);
2041 for (
int qy = 0; qy < Q1D; ++qy)
2043 for (
int qx = 0; qx < Q1D; ++qx)
2046 curl[qz][qy][qx][0] -= gradXY[qy][qx][1] * wDz;
2047 curl[qz][qy][qx][2] += gradXY[qy][qx][0] * wz;
2053 osc += D1Dx * D1Dy * D1Dz;
2058 const int D1Dz = D1D - 1;
2059 const int D1Dy = D1D;
2060 const int D1Dx = D1D;
2062 for (
int dx = 0; dx < D1Dx; ++dx)
2064 real_t gradYZ[MQ1D][MQ1D][2];
2065 for (
int qz = 0; qz < Q1D; ++qz)
2067 for (
int qy = 0; qy < Q1D; ++qy)
2069 for (
int d = 0; d < 2; ++d)
2071 gradYZ[qz][qy][d] = 0.0;
2076 for (
int dy = 0; dy < D1Dy; ++dy)
2079 for (
int qz = 0; qz < Q1D; ++qz)
2084 for (
int dz = 0; dz < D1Dz; ++dz)
2086 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
2087 for (
int qz = 0; qz < Q1D; ++qz)
2089 massZ[qz] += t * Bo(qz,dz);
2093 for (
int qy = 0; qy < Q1D; ++qy)
2095 const real_t wy = Bc(qy,dy);
2096 const real_t wDy = Gc(qy,dy);
2097 for (
int qz = 0; qz < Q1D; ++qz)
2099 const real_t wz = massZ[qz];
2100 gradYZ[qz][qy][0] += wz * wy;
2101 gradYZ[qz][qy][1] += wz * wDy;
2106 for (
int qx = 0; qx < Q1D; ++qx)
2108 const real_t wx = Bc(qx,dx);
2109 const real_t wDx = Gc(qx,dx);
2111 for (
int qy = 0; qy < Q1D; ++qy)
2113 for (
int qz = 0; qz < Q1D; ++qz)
2116 curl[qz][qy][qx][0] += gradYZ[qz][qy][1] * wx;
2117 curl[qz][qy][qx][1] -= gradYZ[qz][qy][0] * wDx;
2125 for (
int qz = 0; qz < Q1D; ++qz)
2127 for (
int qy = 0; qy < Q1D; ++qy)
2129 for (
int qx = 0; qx < Q1D; ++qx)
2131 const real_t O11 = op(0,qx,qy,qz,e);
2134 for (
int c = 0; c < VDIM; ++c)
2136 curl[qz][qy][qx][c] *= O11;
2141 const real_t O21 = op(1,qx,qy,qz,e);
2142 const real_t O31 = op(2,qx,qy,qz,e);
2143 const real_t O12 = op(3,qx,qy,qz,e);
2144 const real_t O22 = op(4,qx,qy,qz,e);
2145 const real_t O32 = op(5,qx,qy,qz,e);
2146 const real_t O13 = op(6,qx,qy,qz,e);
2147 const real_t O23 = op(7,qx,qy,qz,e);
2148 const real_t O33 = op(8,qx,qy,qz,e);
2149 const real_t curlX = curl[qz][qy][qx][0];
2150 const real_t curlY = curl[qz][qy][qx][1];
2151 const real_t curlZ = curl[qz][qy][qx][2];
2152 curl[qz][qy][qx][0] = (O11*curlX)+(O12*curlY)+(O13*curlZ);
2153 curl[qz][qy][qx][1] = (O21*curlX)+(O22*curlY)+(O23*curlZ);
2154 curl[qz][qy][qx][2] = (O31*curlX)+(O32*curlY)+(O33*curlZ);
2160 for (
int qz = 0; qz < Q1D; ++qz)
2162 real_t massXY[MD1D][MD1D];
2166 for (
int c = 0; c < VDIM; ++c)
2168 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
2169 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
2170 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
2172 for (
int dy = 0; dy < D1Dy; ++dy)
2174 for (
int dx = 0; dx < D1Dx; ++dx)
2179 for (
int qy = 0; qy < Q1D; ++qy)
2182 for (
int dx = 0; dx < D1Dx; ++dx)
2186 for (
int qx = 0; qx < Q1D; ++qx)
2188 for (
int dx = 0; dx < D1Dx; ++dx)
2190 massX[dx] += curl[qz][qy][qx][c] * ((c == 0) ? Bot(dx,qx) : Bct(dx,qx));
2194 for (
int dy = 0; dy < D1Dy; ++dy)
2196 const real_t wy = (c == 1) ? Bot(dy,qy) : Bct(dy,qy);
2197 for (
int dx = 0; dx < D1Dx; ++dx)
2199 massXY[dy][dx] += massX[dx] * wy;
2204 for (
int dz = 0; dz < D1Dz; ++dz)
2206 const real_t wz = (c == 2) ? Bot(dz,qz) : Bct(dz,qz);
2207 for (
int dy = 0; dy < D1Dy; ++dy)
2209 for (
int dx = 0; dx < D1Dx; ++dx)
2211 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += massXY[dy][dx] * wz;
2216 osc += D1Dx * D1Dy * D1Dz;
2223template<
int T_D1D = 0,
int T_Q1D = 0>
2224inline void SmemPAHcurlL2Apply3D(
const int d1d,
2228 const Array<real_t> &bo,
2229 const Array<real_t> &bc,
2230 const Array<real_t> &gc,
2231 const Vector &pa_data,
2236 "Error: d1d > HCURL_MAX_D1D");
2238 "Error: q1d > HCURL_MAX_Q1D");
2239 const int D1D = T_D1D ? T_D1D : d1d;
2240 const int Q1D = T_Q1D ? T_Q1D : q1d;
2242 auto Bo =
Reshape(bo.Read(), Q1D, D1D-1);
2243 auto Bc =
Reshape(bc.Read(), Q1D, D1D);
2244 auto Gc =
Reshape(gc.Read(), Q1D, D1D);
2245 auto op =
Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
2246 auto X =
Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
2247 auto Y =
Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
2249 auto device_kernel = [=] MFEM_DEVICE (
int e)
2251 constexpr int VDIM = 3;
2252 constexpr int maxCoeffDim = 9;
2253 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
2254 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
2255 const int D1D = T_D1D ? T_D1D : d1d;
2256 const int Q1D = T_Q1D ? T_Q1D : q1d;
2258 MFEM_SHARED
real_t sBo[MD1D][MQ1D];
2259 MFEM_SHARED
real_t sBc[MD1D][MQ1D];
2260 MFEM_SHARED
real_t sGc[MD1D][MQ1D];
2263 MFEM_SHARED
real_t sop[maxCoeffDim][MQ1D][MQ1D];
2264 MFEM_SHARED
real_t curl[MQ1D][MQ1D][3];
2266 MFEM_SHARED
real_t sX[MD1D][MD1D][MD1D];
2268 MFEM_FOREACH_THREAD(qx,x,Q1D)
2270 MFEM_FOREACH_THREAD(qy,y,Q1D)
2272 MFEM_FOREACH_THREAD(qz,z,Q1D)
2274 for (
int i=0; i<coeffDim; ++i)
2276 opc[i] = op(i,qx,qy,qz,e);
2282 const int tidx = MFEM_THREAD_ID(x);
2283 const int tidy = MFEM_THREAD_ID(y);
2284 const int tidz = MFEM_THREAD_ID(z);
2288 MFEM_FOREACH_THREAD(d,y,D1D)
2290 MFEM_FOREACH_THREAD(q,x,Q1D)
2292 sBc[d][q] = Bc(q,d);
2293 sGc[d][q] = Gc(q,d);
2296 sBo[d][q] = Bo(q,d);
2303 for (
int qz=0; qz < Q1D; ++qz)
2307 MFEM_FOREACH_THREAD(qy,y,Q1D)
2309 MFEM_FOREACH_THREAD(qx,x,Q1D)
2311 for (
int i=0; i<3; ++i)
2313 curl[qy][qx][i] = 0.0;
2320 for (
int c = 0; c < VDIM; ++c)
2322 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
2323 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
2324 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
2326 MFEM_FOREACH_THREAD(dz,z,D1Dz)
2328 MFEM_FOREACH_THREAD(dy,y,D1Dy)
2330 MFEM_FOREACH_THREAD(dx,x,D1Dx)
2332 sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
2342 for (
int i=0; i<coeffDim; ++i)
2344 sop[i][tidx][tidy] = opc[i];
2348 MFEM_FOREACH_THREAD(qy,y,Q1D)
2350 MFEM_FOREACH_THREAD(qx,x,Q1D)
2360 for (
int dz = 0; dz < D1Dz; ++dz)
2362 const real_t wz = sBc[dz][qz];
2363 const real_t wDz = sGc[dz][qz];
2365 for (
int dy = 0; dy < D1Dy; ++dy)
2367 const real_t wy = sBc[dy][qy];
2368 const real_t wDy = sGc[dy][qy];
2370 for (
int dx = 0; dx < D1Dx; ++dx)
2372 const real_t wx = sX[dz][dy][dx] * sBo[dx][qx];
2379 curl[qy][qx][1] += v;
2380 curl[qy][qx][2] -=
u;
2386 for (
int dz = 0; dz < D1Dz; ++dz)
2388 const real_t wz = sBc[dz][qz];
2389 const real_t wDz = sGc[dz][qz];
2391 for (
int dy = 0; dy < D1Dy; ++dy)
2393 const real_t wy = sBo[dy][qy];
2395 for (
int dx = 0; dx < D1Dx; ++dx)
2397 const real_t t = sX[dz][dy][dx];
2398 const real_t wx = t * sBc[dx][qx];
2399 const real_t wDx = t * sGc[dx][qx];
2407 curl[qy][qx][0] -= v;
2408 curl[qy][qx][2] +=
u;
2414 for (
int dz = 0; dz < D1Dz; ++dz)
2416 const real_t wz = sBo[dz][qz];
2418 for (
int dy = 0; dy < D1Dy; ++dy)
2420 const real_t wy = sBc[dy][qy];
2421 const real_t wDy = sGc[dy][qy];
2423 for (
int dx = 0; dx < D1Dx; ++dx)
2425 const real_t t = sX[dz][dy][dx];
2426 const real_t wx = t * sBc[dx][qx];
2427 const real_t wDx = t * sGc[dx][qx];
2435 curl[qy][qx][0] += v;
2436 curl[qy][qx][1] -=
u;
2442 osc += D1Dx * D1Dy * D1Dz;
2450 MFEM_FOREACH_THREAD(dz,z,D1D)
2452 const real_t wcz = sBc[dz][qz];
2453 const real_t wz = (dz < D1D-1) ? sBo[dz][qz] : 0.0;
2455 MFEM_FOREACH_THREAD(dy,y,D1D)
2457 MFEM_FOREACH_THREAD(dx,x,D1D)
2459 for (
int qy = 0; qy < Q1D; ++qy)
2461 const real_t wcy = sBc[dy][qy];
2462 const real_t wy = (dy < D1D-1) ? sBo[dy][qy] : 0.0;
2464 for (
int qx = 0; qx < Q1D; ++qx)
2466 const real_t O11 = sop[0][qx][qy];
2470 c1 = O11 * curl[qy][qx][0];
2471 c2 = O11 * curl[qy][qx][1];
2472 c3 = O11 * curl[qy][qx][2];
2476 const real_t O21 = sop[1][qx][qy];
2477 const real_t O31 = sop[2][qx][qy];
2478 const real_t O12 = sop[3][qx][qy];
2479 const real_t O22 = sop[4][qx][qy];
2480 const real_t O32 = sop[5][qx][qy];
2481 const real_t O13 = sop[6][qx][qy];
2482 const real_t O23 = sop[7][qx][qy];
2483 const real_t O33 = sop[8][qx][qy];
2484 c1 = (O11*curl[qy][qx][0])+(O12*curl[qy][qx][1])+(O13*curl[qy][qx][2]);
2485 c2 = (O21*curl[qy][qx][0])+(O22*curl[qy][qx][1])+(O23*curl[qy][qx][2]);
2486 c3 = (O31*curl[qy][qx][0])+(O32*curl[qy][qx][1])+(O33*curl[qy][qx][2]);
2489 const real_t wcx = sBc[dx][qx];
2493 const real_t wx = sBo[dx][qx];
2494 dxyz1 += c1 * wx * wcy * wcz;
2497 dxyz2 += c2 * wcx * wy * wcz;
2498 dxyz3 += c3 * wcx * wcy * wz;
2507 MFEM_FOREACH_THREAD(dz,z,D1D)
2509 MFEM_FOREACH_THREAD(dy,y,D1D)
2511 MFEM_FOREACH_THREAD(dx,x,D1D)
2515 Y(dx + ((dy + (dz * D1D)) * (D1D-1)), e) += dxyz1;
2519 Y(dx + ((dy + (dz * (D1D-1))) * D1D) + ((D1D-1)*D1D*D1D), e) += dxyz2;
2523 Y(dx + ((dy + (dz * D1D)) * D1D) + (2*(D1D-1)*D1D*D1D), e) += dxyz3;
2531 auto host_kernel = [&] MFEM_LAMBDA (
int)
2533 MFEM_ABORT_KERNEL(
"This kernel should only be used on GPU.");
2536 ForallWrap<3>(
true, NE, device_kernel, host_kernel, Q1D, Q1D, Q1D);
2540template<
int T_D1D = 0,
int T_Q1D = 0>
2541inline void PAHcurlL2ApplyTranspose3D(
const int d1d,
2545 const Array<real_t> &bo,
2546 const Array<real_t> &bc,
2547 const Array<real_t> &bot,
2548 const Array<real_t> &bct,
2549 const Array<real_t> &gct,
2550 const Vector &pa_data,
2556 "Error: d1d > HCURL_MAX_D1D");
2558 "Error: q1d > HCURL_MAX_Q1D");
2559 const int D1D = T_D1D ? T_D1D : d1d;
2560 const int Q1D = T_Q1D ? T_Q1D : q1d;
2562 auto Bo =
Reshape(bo.Read(), Q1D, D1D-1);
2563 auto Bc =
Reshape(bc.Read(), Q1D, D1D);
2564 auto Bot =
Reshape(bot.Read(), D1D-1, Q1D);
2565 auto Bct =
Reshape(bct.Read(), D1D, Q1D);
2566 auto Gct =
Reshape(gct.Read(), D1D, Q1D);
2567 auto op =
Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
2568 auto X =
Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
2569 auto Y =
Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
2573 constexpr int VDIM = 3;
2574 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
2575 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
2576 const int D1D = T_D1D ? T_D1D : d1d;
2577 const int Q1D = T_Q1D ? T_Q1D : q1d;
2579 real_t mass[MQ1D][MQ1D][MQ1D][VDIM];
2581 for (
int qz = 0; qz < Q1D; ++qz)
2583 for (
int qy = 0; qy < Q1D; ++qy)
2585 for (
int qx = 0; qx < Q1D; ++qx)
2587 for (
int c = 0; c < VDIM; ++c)
2589 mass[qz][qy][qx][c] = 0.0;
2597 for (
int c = 0; c < VDIM; ++c)
2599 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
2600 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
2601 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
2603 for (
int dz = 0; dz < D1Dz; ++dz)
2605 real_t massXY[MQ1D][MQ1D];
2606 for (
int qy = 0; qy < Q1D; ++qy)
2608 for (
int qx = 0; qx < Q1D; ++qx)
2610 massXY[qy][qx] = 0.0;
2614 for (
int dy = 0; dy < D1Dy; ++dy)
2617 for (
int qx = 0; qx < Q1D; ++qx)
2622 for (
int dx = 0; dx < D1Dx; ++dx)
2624 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
2625 for (
int qx = 0; qx < Q1D; ++qx)
2627 massX[qx] += t * ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
2631 for (
int qy = 0; qy < Q1D; ++qy)
2633 const real_t wy = (c == 1) ? Bo(qy,dy) : Bc(qy,dy);
2634 for (
int qx = 0; qx < Q1D; ++qx)
2636 const real_t wx = massX[qx];
2637 massXY[qy][qx] += wx * wy;
2642 for (
int qz = 0; qz < Q1D; ++qz)
2644 const real_t wz = (c == 2) ? Bo(qz,dz) : Bc(qz,dz);
2645 for (
int qy = 0; qy < Q1D; ++qy)
2647 for (
int qx = 0; qx < Q1D; ++qx)
2649 mass[qz][qy][qx][c] += massXY[qy][qx] * wz;
2655 osc += D1Dx * D1Dy * D1Dz;
2659 for (
int qz = 0; qz < Q1D; ++qz)
2661 for (
int qy = 0; qy < Q1D; ++qy)
2663 for (
int qx = 0; qx < Q1D; ++qx)
2665 const real_t O11 = op(0,qx,qy,qz,e);
2668 for (
int c = 0; c < VDIM; ++c)
2670 mass[qz][qy][qx][c] *= O11;
2675 const real_t O12 = op(1,qx,qy,qz,e);
2676 const real_t O13 = op(2,qx,qy,qz,e);
2677 const real_t O21 = op(3,qx,qy,qz,e);
2678 const real_t O22 = op(4,qx,qy,qz,e);
2679 const real_t O23 = op(5,qx,qy,qz,e);
2680 const real_t O31 = op(6,qx,qy,qz,e);
2681 const real_t O32 = op(7,qx,qy,qz,e);
2682 const real_t O33 = op(8,qx,qy,qz,e);
2683 const real_t massX = mass[qz][qy][qx][0];
2684 const real_t massY = mass[qz][qy][qx][1];
2685 const real_t massZ = mass[qz][qy][qx][2];
2686 mass[qz][qy][qx][0] = (O11*massX)+(O12*massY)+(O13*massZ);
2687 mass[qz][qy][qx][1] = (O21*massX)+(O22*massY)+(O23*massZ);
2688 mass[qz][qy][qx][2] = (O31*massX)+(O32*massY)+(O33*massZ);
2697 const int D1Dz = D1D;
2698 const int D1Dy = D1D;
2699 const int D1Dx = D1D - 1;
2701 for (
int qz = 0; qz < Q1D; ++qz)
2703 real_t gradXY12[MD1D][MD1D];
2704 real_t gradXY21[MD1D][MD1D];
2706 for (
int dy = 0; dy < D1Dy; ++dy)
2708 for (
int dx = 0; dx < D1Dx; ++dx)
2710 gradXY12[dy][dx] = 0.0;
2711 gradXY21[dy][dx] = 0.0;
2714 for (
int qy = 0; qy < Q1D; ++qy)
2717 for (
int dx = 0; dx < D1Dx; ++dx)
2719 for (
int n = 0; n < 2; ++n)
2724 for (
int qx = 0; qx < Q1D; ++qx)
2726 for (
int dx = 0; dx < D1Dx; ++dx)
2728 const real_t wx = Bot(dx,qx);
2730 massX[dx][0] += wx * mass[qz][qy][qx][1];
2731 massX[dx][1] += wx * mass[qz][qy][qx][2];
2734 for (
int dy = 0; dy < D1Dy; ++dy)
2736 const real_t wy = Bct(dy,qy);
2737 const real_t wDy = Gct(dy,qy);
2739 for (
int dx = 0; dx < D1Dx; ++dx)
2741 gradXY21[dy][dx] += massX[dx][0] * wy;
2742 gradXY12[dy][dx] += massX[dx][1] * wDy;
2747 for (
int dz = 0; dz < D1Dz; ++dz)
2749 const real_t wz = Bct(dz,qz);
2750 const real_t wDz = Gct(dz,qz);
2751 for (
int dy = 0; dy < D1Dy; ++dy)
2753 for (
int dx = 0; dx < D1Dx; ++dx)
2757 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
2758 e) += (gradXY21[dy][dx] * wDz) - (gradXY12[dy][dx] * wz);
2764 osc += D1Dx * D1Dy * D1Dz;
2769 const int D1Dz = D1D;
2770 const int D1Dy = D1D - 1;
2771 const int D1Dx = D1D;
2773 for (
int qz = 0; qz < Q1D; ++qz)
2775 real_t gradXY02[MD1D][MD1D];
2776 real_t gradXY20[MD1D][MD1D];
2778 for (
int dy = 0; dy < D1Dy; ++dy)
2780 for (
int dx = 0; dx < D1Dx; ++dx)
2782 gradXY02[dy][dx] = 0.0;
2783 gradXY20[dy][dx] = 0.0;
2786 for (
int qx = 0; qx < Q1D; ++qx)
2789 for (
int dy = 0; dy < D1Dy; ++dy)
2794 for (
int qy = 0; qy < Q1D; ++qy)
2796 for (
int dy = 0; dy < D1Dy; ++dy)
2798 const real_t wy = Bot(dy,qy);
2800 massY[dy][0] += wy * mass[qz][qy][qx][2];
2801 massY[dy][1] += wy * mass[qz][qy][qx][0];
2804 for (
int dx = 0; dx < D1Dx; ++dx)
2806 const real_t wx = Bct(dx,qx);
2807 const real_t wDx = Gct(dx,qx);
2809 for (
int dy = 0; dy < D1Dy; ++dy)
2811 gradXY02[dy][dx] += massY[dy][0] * wDx;
2812 gradXY20[dy][dx] += massY[dy][1] * wx;
2817 for (
int dz = 0; dz < D1Dz; ++dz)
2819 const real_t wz = Bct(dz,qz);
2820 const real_t wDz = Gct(dz,qz);
2821 for (
int dy = 0; dy < D1Dy; ++dy)
2823 for (
int dx = 0; dx < D1Dx; ++dx)
2827 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
2828 e) += (-gradXY20[dy][dx] * wDz) + (gradXY02[dy][dx] * wz);
2834 osc += D1Dx * D1Dy * D1Dz;
2839 const int D1Dz = D1D - 1;
2840 const int D1Dy = D1D;
2841 const int D1Dx = D1D;
2843 for (
int qx = 0; qx < Q1D; ++qx)
2845 real_t gradYZ01[MD1D][MD1D];
2846 real_t gradYZ10[MD1D][MD1D];
2848 for (
int dy = 0; dy < D1Dy; ++dy)
2850 for (
int dz = 0; dz < D1Dz; ++dz)
2852 gradYZ01[dz][dy] = 0.0;
2853 gradYZ10[dz][dy] = 0.0;
2856 for (
int qy = 0; qy < Q1D; ++qy)
2859 for (
int dz = 0; dz < D1Dz; ++dz)
2861 for (
int n = 0; n < 2; ++n)
2866 for (
int qz = 0; qz < Q1D; ++qz)
2868 for (
int dz = 0; dz < D1Dz; ++dz)
2870 const real_t wz = Bot(dz,qz);
2872 massZ[dz][0] += wz * mass[qz][qy][qx][0];
2873 massZ[dz][1] += wz * mass[qz][qy][qx][1];
2876 for (
int dy = 0; dy < D1Dy; ++dy)
2878 const real_t wy = Bct(dy,qy);
2879 const real_t wDy = Gct(dy,qy);
2881 for (
int dz = 0; dz < D1Dz; ++dz)
2883 gradYZ01[dz][dy] += wy * massZ[dz][1];
2884 gradYZ10[dz][dy] += wDy * massZ[dz][0];
2889 for (
int dx = 0; dx < D1Dx; ++dx)
2891 const real_t wx = Bct(dx,qx);
2892 const real_t wDx = Gct(dx,qx);
2894 for (
int dy = 0; dy < D1Dy; ++dy)
2896 for (
int dz = 0; dz < D1Dz; ++dz)
2900 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
2901 e) += (gradYZ10[dz][dy] * wx) - (gradYZ01[dz][dy] * wDx);
2911template<
int T_D1D = 0,
int T_Q1D = 0>
2912inline void SmemPAHcurlL2ApplyTranspose3D(
const int d1d,
2916 const Array<real_t> &bo,
2917 const Array<real_t> &bc,
2918 const Array<real_t> &gc,
2919 const Vector &pa_data,
2924 "Error: d1d > HCURL_MAX_D1D");
2926 "Error: q1d > HCURL_MAX_Q1D");
2927 const int D1D = T_D1D ? T_D1D : d1d;
2928 const int Q1D = T_Q1D ? T_Q1D : q1d;
2930 auto Bo =
Reshape(bo.Read(), Q1D, D1D-1);
2931 auto Bc =
Reshape(bc.Read(), Q1D, D1D);
2932 auto Gc =
Reshape(gc.Read(), Q1D, D1D);
2933 auto op =
Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
2934 auto X =
Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
2935 auto Y =
Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
2937 auto device_kernel = [=] MFEM_DEVICE (
int e)
2939 constexpr int VDIM = 3;
2940 constexpr int maxCoeffDim = 9;
2941 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
2942 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
2943 const int D1D = T_D1D ? T_D1D : d1d;
2944 const int Q1D = T_Q1D ? T_Q1D : q1d;
2946 MFEM_SHARED
real_t sBo[MD1D][MQ1D];
2947 MFEM_SHARED
real_t sBc[MD1D][MQ1D];
2948 MFEM_SHARED
real_t sGc[MD1D][MQ1D];
2951 MFEM_SHARED
real_t sop[maxCoeffDim][MQ1D][MQ1D];
2952 MFEM_SHARED
real_t mass[MQ1D][MQ1D][3];
2954 MFEM_SHARED
real_t sX[MD1D][MD1D][MD1D];
2956 MFEM_FOREACH_THREAD(qx,x,Q1D)
2958 MFEM_FOREACH_THREAD(qy,y,Q1D)
2960 MFEM_FOREACH_THREAD(qz,z,Q1D)
2962 for (
int i=0; i<coeffDim; ++i)
2964 opc[i] = op(i,qx,qy,qz,e);
2970 const int tidx = MFEM_THREAD_ID(x);
2971 const int tidy = MFEM_THREAD_ID(y);
2972 const int tidz = MFEM_THREAD_ID(z);
2976 MFEM_FOREACH_THREAD(d,y,D1D)
2978 MFEM_FOREACH_THREAD(q,x,Q1D)
2980 sBc[d][q] = Bc(q,d);
2981 sGc[d][q] = Gc(q,d);
2984 sBo[d][q] = Bo(q,d);
2991 for (
int qz=0; qz < Q1D; ++qz)
2995 MFEM_FOREACH_THREAD(qy,y,Q1D)
2997 MFEM_FOREACH_THREAD(qx,x,Q1D)
2999 for (
int i=0; i<3; ++i)
3001 mass[qy][qx][i] = 0.0;
3008 for (
int c = 0; c < VDIM; ++c)
3010 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
3011 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
3012 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
3014 MFEM_FOREACH_THREAD(dz,z,D1Dz)
3016 MFEM_FOREACH_THREAD(dy,y,D1Dy)
3018 MFEM_FOREACH_THREAD(dx,x,D1Dx)
3020 sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
3030 for (
int i=0; i<coeffDim; ++i)
3032 sop[i][tidx][tidy] = opc[i];
3036 MFEM_FOREACH_THREAD(qy,y,Q1D)
3038 MFEM_FOREACH_THREAD(qx,x,Q1D)
3042 for (
int dz = 0; dz < D1Dz; ++dz)
3044 const real_t wz = (c == 2) ? sBo[dz][qz] : sBc[dz][qz];
3046 for (
int dy = 0; dy < D1Dy; ++dy)
3048 const real_t wy = (c == 1) ? sBo[dy][qy] : sBc[dy][qy];
3050 for (
int dx = 0; dx < D1Dx; ++dx)
3052 const real_t wx = sX[dz][dy][dx] * ((c == 0) ? sBo[dx][qx] : sBc[dx][qx]);
3058 mass[qy][qx][c] +=
u;
3063 osc += D1Dx * D1Dy * D1Dz;
3071 MFEM_FOREACH_THREAD(dz,z,D1D)
3073 const real_t wcz = sBc[dz][qz];
3074 const real_t wcDz = sGc[dz][qz];
3075 const real_t wz = (dz < D1D-1) ? sBo[dz][qz] : 0.0;
3077 MFEM_FOREACH_THREAD(dy,y,D1D)
3079 MFEM_FOREACH_THREAD(dx,x,D1D)
3081 for (
int qy = 0; qy < Q1D; ++qy)
3083 const real_t wcy = sBc[dy][qy];
3084 const real_t wcDy = sGc[dy][qy];
3085 const real_t wy = (dy < D1D-1) ? sBo[dy][qy] : 0.0;
3087 for (
int qx = 0; qx < Q1D; ++qx)
3089 const real_t O11 = sop[0][qx][qy];
3093 c1 = O11 * mass[qy][qx][0];
3094 c2 = O11 * mass[qy][qx][1];
3095 c3 = O11 * mass[qy][qx][2];
3099 const real_t O12 = sop[1][qx][qy];
3100 const real_t O13 = sop[2][qx][qy];
3101 const real_t O21 = sop[3][qx][qy];
3102 const real_t O22 = sop[4][qx][qy];
3103 const real_t O23 = sop[5][qx][qy];
3104 const real_t O31 = sop[6][qx][qy];
3105 const real_t O32 = sop[7][qx][qy];
3106 const real_t O33 = sop[8][qx][qy];
3108 c1 = (O11*mass[qy][qx][0])+(O12*mass[qy][qx][1])+(O13*mass[qy][qx][2]);
3109 c2 = (O21*mass[qy][qx][0])+(O22*mass[qy][qx][1])+(O23*mass[qy][qx][2]);
3110 c3 = (O31*mass[qy][qx][0])+(O32*mass[qy][qx][1])+(O33*mass[qy][qx][2]);
3113 const real_t wcx = sBc[dx][qx];
3114 const real_t wDx = sGc[dx][qx];
3118 const real_t wx = sBo[dx][qx];
3119 dxyz1 += (wx * c2 * wcy * wcDz) - (wx * c3 * wcDy * wcz);
3122 dxyz2 += (-wy * c1 * wcx * wcDz) + (wy * c3 * wDx * wcz);
3124 dxyz3 += (wcDy * wz * c1 * wcx) - (wcy * wz * c2 * wDx);
3133 MFEM_FOREACH_THREAD(dz,z,D1D)
3135 MFEM_FOREACH_THREAD(dy,y,D1D)
3137 MFEM_FOREACH_THREAD(dx,x,D1D)
3141 Y(dx + ((dy + (dz * D1D)) * (D1D-1)), e) += dxyz1;
3145 Y(dx + ((dy + (dz * (D1D-1))) * D1D) + ((D1D-1)*D1D*D1D), e) += dxyz2;
3149 Y(dx + ((dy + (dz * D1D)) * D1D) + (2*(D1D-1)*D1D*D1D), e) += dxyz3;
3157 auto host_kernel = [&] MFEM_LAMBDA (
int)
3159 MFEM_ABORT_KERNEL(
"This kernel should only be used on GPU.");
3162 ForallWrap<3>(
true, NE, device_kernel, host_kernel, Q1D, Q1D, Q1D);
3167template<
int DIM,
int T_D1D,
int T_Q1D>
3170 if constexpr (
DIM == 2)
3172 return internal::PACurlCurlApply2D;
3174 else if constexpr (
DIM == 3)
3178 return internal::SmemPACurlCurlApply3D<T_D1D, T_Q1D>;
3182 return internal::PACurlCurlApply3D;
3188template <
int DIM,
int T_D1D,
int T_Q1D>
3190CurlCurlIntegrator::DiagonalPAKernels::Kernel()
3192 if constexpr (
DIM == 2)
3194 return internal::PACurlCurlAssembleDiagonal2D;
3196 else if constexpr (
DIM == 3)
3200 return internal::SmemPACurlCurlAssembleDiagonal3D<T_D1D, T_Q1D>;
3204 return internal::PACurlCurlAssembleDiagonal3D;
void(*)(const int, const int, const bool, const int, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Vector &, Vector &) DiagonalKernelType
arguments: d1d, q1d, symmetric, ne, Bo, Bc, Go, Gc, pa_data, diag
void(*)( const int, const int, const bool, const int, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Vector &, const Vector &, Vector &, const bool) ApplyKernelType
static bool Allows(unsigned long b_mask)
Return true if any of the backends in the backend mask, b_mask, are allowed.
real_t u(const Vector &xvec)
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
void ForallWrap(const bool use_dev, const int N, d_lambda &&d_body, h_lambda &&h_body, const int X=0, const int Y=0, const int Z=0, const int G=0)
The forall kernel body wrapper.
void forall_3D(int N, int X, int Y, int Z, lambda &&body)
void forall(int N, lambda &&body)
@ DEVICE_MASK
Biwise-OR of all device backends.
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.