12#ifndef MFEM_BILININTEG_DIFFUSION_KERNELS_HPP
13#define MFEM_BILININTEG_DIFFUSION_KERNELS_HPP
30void PADiffusionSetup(
const int dim,
36 const Array<real_t> &W,
43void PADiffusionSetup2D(
const int Q1D,
46 const Array<real_t> &w,
52void PADiffusionSetup3D(
const int Q1D,
55 const Array<real_t> &w,
62void OccaPADiffusionSetup2D(
const int D1D,
65 const Array<real_t> &W,
71void OccaPADiffusionSetup3D(
const int D1D,
74 const Array<real_t> &W,
80void PADiffusionAssembleDiagonal(
const int dim,
85 const Array<real_t> &B,
86 const Array<real_t> &G,
91template<
int T_D1D = 0,
int T_Q1D = 0>
92inline void PADiffusionDiagonal2D(
const int NE,
94 const Array<real_t> &
b,
95 const Array<real_t> &g,
101 const int D1D = T_D1D ? T_D1D : d1d;
102 const int Q1D = T_Q1D ? T_Q1D : q1d;
105 auto B =
Reshape(
b.Read(), Q1D, D1D);
106 auto G =
Reshape(g.Read(), Q1D, D1D);
109 auto D =
Reshape(d.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
110 auto Y =
Reshape(y.ReadWrite(), D1D, D1D, NE);
113 const int D1D = T_D1D ? T_D1D : d1d;
114 const int Q1D = T_Q1D ? T_Q1D : q1d;
115 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
116 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
121 for (
int qx = 0; qx < Q1D; ++qx)
123 for (
int dy = 0; dy < D1D; ++dy)
128 for (
int qy = 0; qy < Q1D; ++qy)
130 const int q = qx + qy * Q1D;
131 const real_t D00 = D(q,0,e);
132 const real_t D10 = D(q,1,e);
133 const real_t D01 = symmetric ? D10 : D(q,2,e);
134 const real_t D11 = symmetric ? D(q,2,e) : D(q,3,e);
135 QD0[qx][dy] += B(qy, dy) * B(qy, dy) * D00;
136 QD1[qx][dy] += B(qy, dy) * G(qy, dy) * (D01 + D10);
137 QD2[qx][dy] += G(qy, dy) * G(qy, dy) * D11;
141 for (
int dy = 0; dy < D1D; ++dy)
143 for (
int dx = 0; dx < D1D; ++dx)
145 for (
int qx = 0; qx < Q1D; ++qx)
147 Y(dx,dy,e) += G(qx, dx) * G(qx, dx) * QD0[qx][dy];
148 Y(dx,dy,e) += G(qx, dx) * B(qx, dx) * QD1[qx][dy];
149 Y(dx,dy,e) += B(qx, dx) * B(qx, dx) * QD2[qx][dy];
158constexpr int ipow(
int x,
int p) {
return p == 0 ? 1 : x*ipow(x,
p-1); }
159constexpr int D11(
int x) {
return (11 - x)/2; }
160constexpr int D10(
int x) {
return (10 - x)/2; }
161constexpr int NBZApply(
int D1D)
163 return ipow(2, D11(D1D) >= 0 ? D11(D1D) : 0);
165constexpr int NBZDiagonal(
int D1D)
167 return ipow(2, D10(D1D) >= 0 ? D10(D1D) : 0);
172template<
int T_D1D = 0,
int T_Q1D = 0>
173inline void SmemPADiffusionDiagonal2D(
const int NE,
174 const bool symmetric,
175 const Array<real_t> &b_,
176 const Array<real_t> &g_,
182 static constexpr int T_NBZ = diffusion::NBZDiagonal(T_D1D);
183 static constexpr int NBZ = T_NBZ ? T_NBZ : 1;
184 const int D1D = T_D1D ? T_D1D : d1d;
185 const int Q1D = T_Q1D ? T_Q1D : q1d;
188 MFEM_VERIFY(D1D <= max_d1d,
"");
189 MFEM_VERIFY(Q1D <= max_q1d,
"");
190 auto b =
Reshape(b_.Read(), Q1D, D1D);
191 auto g =
Reshape(g_.Read(), Q1D, D1D);
192 auto D =
Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
193 auto Y =
Reshape(y_.ReadWrite(), D1D, D1D, NE);
196 const int tidz = MFEM_THREAD_ID(z);
197 const int D1D = T_D1D ? T_D1D : d1d;
198 const int Q1D = T_Q1D ? T_Q1D : q1d;
199 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
200 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
201 MFEM_SHARED
real_t BG[2][MQ1*MD1];
204 MFEM_SHARED
real_t QD[3][NBZ][MD1][MQ1];
210 MFEM_FOREACH_THREAD(d,y,D1D)
212 MFEM_FOREACH_THREAD(q,x,Q1D)
220 MFEM_FOREACH_THREAD(qx,x,Q1D)
222 MFEM_FOREACH_THREAD(dy,y,D1D)
227 for (
int qy = 0; qy < Q1D; ++qy)
229 const int q = qx + qy * Q1D;
230 const real_t D00 = D(q,0,e);
231 const real_t D10 = D(q,1,e);
232 const real_t D01 = symmetric ? D10 : D(q,2,e);
233 const real_t D11 = symmetric ? D(q,2,e) : D(q,3,e);
234 const real_t By = B[qy][dy];
235 const real_t Gy = G[qy][dy];
236 const real_t BBy = By * By;
237 const real_t BGy = By * Gy;
238 const real_t GGy = Gy * Gy;
239 QD0[qx][dy] += BBy * D00;
240 QD1[qx][dy] += BGy * (D01 + D10);
241 QD2[qx][dy] += GGy * D11;
246 MFEM_FOREACH_THREAD(dy,y,D1D)
248 MFEM_FOREACH_THREAD(dx,x,D1D)
250 for (
int qx = 0; qx < Q1D; ++qx)
252 const real_t Bx = B[qx][dx];
253 const real_t Gx = G[qx][dx];
254 const real_t BBx = Bx * Bx;
255 const real_t BGx = Bx * Gx;
256 const real_t GGx = Gx * Gx;
257 Y(dx,dy,e) += GGx * QD0[qx][dy];
258 Y(dx,dy,e) += BGx * QD1[qx][dy];
259 Y(dx,dy,e) += BBx * QD2[qx][dy];
267template<
int T_D1D = 0,
int T_Q1D = 0>
268inline void PADiffusionDiagonal3D(
const int NE,
269 const bool symmetric,
270 const Array<real_t> &
b,
271 const Array<real_t> &g,
277 constexpr int DIM = 3;
278 const int D1D = T_D1D ? T_D1D : d1d;
279 const int Q1D = T_Q1D ? T_Q1D : q1d;
282 MFEM_VERIFY(D1D <= max_d1d,
"");
283 MFEM_VERIFY(Q1D <= max_q1d,
"");
284 auto B =
Reshape(
b.Read(), Q1D, D1D);
285 auto G =
Reshape(g.Read(), Q1D, D1D);
286 auto Q =
Reshape(d.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
287 auto Y =
Reshape(y.ReadWrite(), D1D, D1D, D1D, NE);
290 const int D1D = T_D1D ? T_D1D : d1d;
291 const int Q1D = T_Q1D ? T_Q1D : q1d;
292 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
293 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
294 real_t QQD[MQ1][MQ1][MD1];
295 real_t QDD[MQ1][MD1][MD1];
296 for (
int i = 0; i <
DIM; ++i)
298 for (
int j = 0; j <
DIM; ++j)
301 for (
int qx = 0; qx < Q1D; ++qx)
303 for (
int qy = 0; qy < Q1D; ++qy)
305 for (
int dz = 0; dz < D1D; ++dz)
307 QQD[qx][qy][dz] = 0.0;
308 for (
int qz = 0; qz < Q1D; ++qz)
310 const int q = qx + (qy + qz * Q1D) * Q1D;
311 const int ksym = j >= i ?
312 3 - (3-i)*(2-i)/2 + j:
313 3 - (3-j)*(2-j)/2 + i;
314 const int k = symmetric ? ksym : (i*
DIM) + j;
315 const real_t O = Q(q,k,e);
316 const real_t Bz = B(qz,dz);
317 const real_t Gz = G(qz,dz);
318 const real_t L = i==2 ? Gz : Bz;
319 const real_t R = j==2 ? Gz : Bz;
320 QQD[qx][qy][dz] += L * O * R;
326 for (
int qx = 0; qx < Q1D; ++qx)
328 for (
int dz = 0; dz < D1D; ++dz)
330 for (
int dy = 0; dy < D1D; ++dy)
332 QDD[qx][dy][dz] = 0.0;
333 for (
int qy = 0; qy < Q1D; ++qy)
335 const real_t By = B(qy,dy);
336 const real_t Gy = G(qy,dy);
337 const real_t L = i==1 ? Gy : By;
338 const real_t R = j==1 ? Gy : By;
339 QDD[qx][dy][dz] += L * QQD[qx][qy][dz] * R;
345 for (
int dz = 0; dz < D1D; ++dz)
347 for (
int dy = 0; dy < D1D; ++dy)
349 for (
int dx = 0; dx < D1D; ++dx)
351 for (
int qx = 0; qx < Q1D; ++qx)
353 const real_t Bx = B(qx,dx);
354 const real_t Gx = G(qx,dx);
355 const real_t L = i==0 ? Gx : Bx;
356 const real_t R = j==0 ? Gx : Bx;
357 Y(dx, dy, dz, e) += L * QDD[qx][dy][dz] * R;
368template<
int T_D1D = 0,
int T_Q1D = 0>
369inline void SmemPADiffusionDiagonal3D(
const int NE,
370 const bool symmetric,
371 const Array<real_t> &b_,
372 const Array<real_t> &g_,
378 constexpr int DIM = 3;
379 const int D1D = T_D1D ? T_D1D : d1d;
380 const int Q1D = T_Q1D ? T_Q1D : q1d;
383 MFEM_VERIFY(D1D <= max_d1d,
"");
384 MFEM_VERIFY(Q1D <= max_q1d,
"");
385 auto b =
Reshape(b_.Read(), Q1D, D1D);
386 auto g =
Reshape(g_.Read(), Q1D, D1D);
387 auto D =
Reshape(d_.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
388 auto Y =
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
391 const int tidz = MFEM_THREAD_ID(z);
392 const int D1D = T_D1D ? T_D1D : d1d;
393 const int Q1D = T_Q1D ? T_Q1D : q1d;
394 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
395 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
396 MFEM_SHARED
real_t BG[2][MQ1*MD1];
399 MFEM_SHARED
real_t QQD[MQ1][MQ1][MD1];
400 MFEM_SHARED
real_t QDD[MQ1][MD1][MD1];
403 MFEM_FOREACH_THREAD(d,y,D1D)
405 MFEM_FOREACH_THREAD(q,x,Q1D)
413 for (
int i = 0; i <
DIM; ++i)
415 for (
int j = 0; j <
DIM; ++j)
418 MFEM_FOREACH_THREAD(qx,x,Q1D)
420 MFEM_FOREACH_THREAD(qy,y,Q1D)
422 MFEM_FOREACH_THREAD(dz,z,D1D)
424 QQD[qx][qy][dz] = 0.0;
425 for (
int qz = 0; qz < Q1D; ++qz)
427 const int q = qx + (qy + qz * Q1D) * Q1D;
428 const int ksym = j >= i ?
429 3 - (3-i)*(2-i)/2 + j:
430 3 - (3-j)*(2-j)/2 + i;
431 const int k = symmetric ? ksym : (i*
DIM) + j;
432 const real_t O = D(q,k,e);
433 const real_t Bz = B[qz][dz];
434 const real_t Gz = G[qz][dz];
435 const real_t L = i==2 ? Gz : Bz;
436 const real_t R = j==2 ? Gz : Bz;
437 QQD[qx][qy][dz] += L * O * R;
444 MFEM_FOREACH_THREAD(qx,x,Q1D)
446 MFEM_FOREACH_THREAD(dz,z,D1D)
448 MFEM_FOREACH_THREAD(dy,y,D1D)
450 QDD[qx][dy][dz] = 0.0;
451 for (
int qy = 0; qy < Q1D; ++qy)
453 const real_t By = B[qy][dy];
454 const real_t Gy = G[qy][dy];
455 const real_t L = i==1 ? Gy : By;
456 const real_t R = j==1 ? Gy : By;
457 QDD[qx][dy][dz] += L * QQD[qx][qy][dz] * R;
464 MFEM_FOREACH_THREAD(dz,z,D1D)
466 MFEM_FOREACH_THREAD(dy,y,D1D)
468 MFEM_FOREACH_THREAD(dx,x,D1D)
470 for (
int qx = 0; qx < Q1D; ++qx)
472 const real_t Bx = B[qx][dx];
473 const real_t Gx = G[qx][dx];
474 const real_t L = i==0 ? Gx : Bx;
475 const real_t R = j==0 ? Gx : Bx;
476 Y(dx, dy, dz, e) += L * QDD[qx][dy][dz] * R;
486void PADiffusionApply(
const int dim,
491 const Array<real_t> &B,
492 const Array<real_t> &G,
493 const Array<real_t> &Bt,
494 const Array<real_t> &Gt,
501void OccaPADiffusionApply2D(
const int D1D,
504 const Array<real_t> &B,
505 const Array<real_t> &G,
506 const Array<real_t> &Bt,
507 const Array<real_t> &Gt,
513void OccaPADiffusionApply3D(
const int D1D,
516 const Array<real_t> &B,
517 const Array<real_t> &G,
518 const Array<real_t> &Bt,
519 const Array<real_t> &Gt,
526template<
int T_D1D = 0,
int T_Q1D = 0>
527inline void PADiffusionApply2D(
const int NE,
528 const bool symmetric,
529 const Array<real_t> &b_,
530 const Array<real_t> &g_,
531 const Array<real_t> &bt_,
532 const Array<real_t> >_,
539 const int D1D = T_D1D ? T_D1D : d1d;
540 const int Q1D = T_Q1D ? T_Q1D : q1d;
543 auto B =
Reshape(b_.Read(), Q1D, D1D);
544 auto G =
Reshape(g_.Read(), Q1D, D1D);
545 auto Bt =
Reshape(bt_.Read(), D1D, Q1D);
546 auto Gt =
Reshape(gt_.Read(), D1D, Q1D);
547 auto D =
Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
548 auto X =
Reshape(x_.Read(), D1D, D1D, NE);
549 auto Y =
Reshape(y_.ReadWrite(), D1D, D1D, NE);
552 const int D1D = T_D1D ? T_D1D : d1d;
553 const int Q1D = T_Q1D ? T_Q1D : q1d;
555 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
556 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
558 real_t grad[max_Q1D][max_Q1D][2];
559 for (
int qy = 0; qy < Q1D; ++qy)
561 for (
int qx = 0; qx < Q1D; ++qx)
563 grad[qy][qx][0] = 0.0;
564 grad[qy][qx][1] = 0.0;
567 for (
int dy = 0; dy < D1D; ++dy)
570 for (
int qx = 0; qx < Q1D; ++qx)
575 for (
int dx = 0; dx < D1D; ++dx)
577 const real_t s = X(dx,dy,e);
578 for (
int qx = 0; qx < Q1D; ++qx)
580 gradX[qx][0] += s * B(qx,dx);
581 gradX[qx][1] += s * G(qx,dx);
584 for (
int qy = 0; qy < Q1D; ++qy)
586 const real_t wy = B(qy,dy);
587 const real_t wDy = G(qy,dy);
588 for (
int qx = 0; qx < Q1D; ++qx)
590 grad[qy][qx][0] += gradX[qx][1] * wy;
591 grad[qy][qx][1] += gradX[qx][0] * wDy;
596 for (
int qy = 0; qy < Q1D; ++qy)
598 for (
int qx = 0; qx < Q1D; ++qx)
600 const int q = qx + qy * Q1D;
602 const real_t O11 = D(q,0,e);
603 const real_t O21 = D(q,1,e);
604 const real_t O12 = symmetric ? O21 : D(q,2,e);
605 const real_t O22 = symmetric ? D(q,2,e) : D(q,3,e);
607 const real_t gradX = grad[qy][qx][0];
608 const real_t gradY = grad[qy][qx][1];
610 grad[qy][qx][0] = (O11 * gradX) + (O12 * gradY);
611 grad[qy][qx][1] = (O21 * gradX) + (O22 * gradY);
614 for (
int qy = 0; qy < Q1D; ++qy)
617 for (
int dx = 0; dx < D1D; ++dx)
622 for (
int qx = 0; qx < Q1D; ++qx)
624 const real_t gX = grad[qy][qx][0];
625 const real_t gY = grad[qy][qx][1];
626 for (
int dx = 0; dx < D1D; ++dx)
628 const real_t wx = Bt(dx,qx);
629 const real_t wDx = Gt(dx,qx);
630 gradX[dx][0] += gX * wDx;
631 gradX[dx][1] += gY * wx;
634 for (
int dy = 0; dy < D1D; ++dy)
636 const real_t wy = Bt(dy,qy);
637 const real_t wDy = Gt(dy,qy);
638 for (
int dx = 0; dx < D1D; ++dx)
640 Y(dx,dy,e) += ((gradX[dx][0] * wy) + (gradX[dx][1] * wDy));
648template<
int T_D1D = 0,
int T_Q1D = 0>
649inline void SmemPADiffusionApply2D(
const int NE,
650 const bool symmetric,
651 const Array<real_t> &b_,
652 const Array<real_t> &g_,
653 const Array<real_t> &bt_,
654 const Array<real_t> >_,
661 static constexpr int T_NBZ = diffusion::NBZApply(T_D1D);
662 static constexpr int NBZ = T_NBZ ? T_NBZ : 1;
663 const int D1D = T_D1D ? T_D1D : d1d;
664 const int Q1D = T_Q1D ? T_Q1D : q1d;
667 MFEM_VERIFY(D1D <= max_d1d,
"");
668 MFEM_VERIFY(Q1D <= max_q1d,
"");
669 auto b =
Reshape(b_.Read(), Q1D, D1D);
670 auto g =
Reshape(g_.Read(), Q1D, D1D);
671 auto D =
Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
672 auto x =
Reshape(x_.Read(), D1D, D1D, NE);
673 auto Y =
Reshape(y_.ReadWrite(), D1D, D1D, NE);
676 const int tidz = MFEM_THREAD_ID(z);
677 const int D1D = T_D1D ? T_D1D : d1d;
678 const int Q1D = T_Q1D ? T_Q1D : q1d;
679 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
680 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
681 MFEM_SHARED
real_t sBG[2][MQ1*MD1];
686 MFEM_SHARED
real_t Xz[NBZ][MD1][MD1];
687 MFEM_SHARED
real_t GD[2][NBZ][MD1][MQ1];
688 MFEM_SHARED
real_t GQ[2][NBZ][MD1][MQ1];
694 MFEM_FOREACH_THREAD(dy,y,D1D)
696 MFEM_FOREACH_THREAD(dx,x,D1D)
698 X[dy][dx] = x(dx,dy,e);
703 MFEM_FOREACH_THREAD(dy,y,D1D)
705 MFEM_FOREACH_THREAD(q,x,Q1D)
713 MFEM_FOREACH_THREAD(dy,y,D1D)
715 MFEM_FOREACH_THREAD(qx,x,Q1D)
719 for (
int dx = 0; dx < D1D; ++dx)
721 const real_t coords = X[dy][dx];
722 u += B[qx][dx] * coords;
723 v += G[qx][dx] * coords;
730 MFEM_FOREACH_THREAD(qy,y,Q1D)
732 MFEM_FOREACH_THREAD(qx,x,Q1D)
736 for (
int dy = 0; dy < D1D; ++dy)
738 u += DQ1[dy][qx] * B[qy][dy];
739 v += DQ0[dy][qx] * G[qy][dy];
746 MFEM_FOREACH_THREAD(qy,y,Q1D)
748 MFEM_FOREACH_THREAD(qx,x,Q1D)
750 const int q = (qx + ((qy) * Q1D));
751 const real_t O11 = D(q,0,e);
752 const real_t O21 = D(q,1,e);
753 const real_t O12 = symmetric ? O21 : D(q,2,e);
754 const real_t O22 = symmetric ? D(q,2,e) : D(q,3,e);
755 const real_t gX = QQ0[qy][qx];
756 const real_t gY = QQ1[qy][qx];
757 QQ0[qy][qx] = (O11 * gX) + (O12 * gY);
758 QQ1[qy][qx] = (O21 * gX) + (O22 * gY);
764 MFEM_FOREACH_THREAD(dy,y,D1D)
766 MFEM_FOREACH_THREAD(q,x,Q1D)
774 MFEM_FOREACH_THREAD(qy,y,Q1D)
776 MFEM_FOREACH_THREAD(dx,x,D1D)
780 for (
int qx = 0; qx < Q1D; ++qx)
782 u += Gt[dx][qx] * QQ0[qy][qx];
783 v += Bt[dx][qx] * QQ1[qy][qx];
790 MFEM_FOREACH_THREAD(dy,y,D1D)
792 MFEM_FOREACH_THREAD(dx,x,D1D)
796 for (
int qy = 0; qy < Q1D; ++qy)
798 u += DQ0[qy][dx] * Bt[dy][qy];
799 v += DQ1[qy][dx] * Gt[dy][qy];
801 Y(dx,dy,e) += (
u + v);
808template<
int T_D1D = 0,
int T_Q1D = 0>
809inline void PADiffusionApply3D(
const int NE,
810 const bool symmetric,
811 const Array<real_t> &
b,
812 const Array<real_t> &g,
813 const Array<real_t> &bt,
814 const Array<real_t> >,
818 int d1d = 0,
int q1d = 0)
820 const int D1D = T_D1D ? T_D1D : d1d;
821 const int Q1D = T_Q1D ? T_Q1D : q1d;
824 auto B =
Reshape(
b.Read(), Q1D, D1D);
825 auto G =
Reshape(g.Read(), Q1D, D1D);
826 auto Bt =
Reshape(bt.Read(), D1D, Q1D);
827 auto Gt =
Reshape(gt.Read(), D1D, Q1D);
828 auto D =
Reshape(d_.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
829 auto X =
Reshape(x_.Read(), D1D, D1D, D1D, NE);
830 auto Y =
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
833 const int D1D = T_D1D ? T_D1D : d1d;
834 const int Q1D = T_Q1D ? T_Q1D : q1d;
835 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
836 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
837 real_t grad[max_Q1D][max_Q1D][max_Q1D][3];
838 for (
int qz = 0; qz < Q1D; ++qz)
840 for (
int qy = 0; qy < Q1D; ++qy)
842 for (
int qx = 0; qx < Q1D; ++qx)
844 grad[qz][qy][qx][0] = 0.0;
845 grad[qz][qy][qx][1] = 0.0;
846 grad[qz][qy][qx][2] = 0.0;
850 for (
int dz = 0; dz < D1D; ++dz)
852 real_t gradXY[max_Q1D][max_Q1D][3];
853 for (
int qy = 0; qy < Q1D; ++qy)
855 for (
int qx = 0; qx < Q1D; ++qx)
857 gradXY[qy][qx][0] = 0.0;
858 gradXY[qy][qx][1] = 0.0;
859 gradXY[qy][qx][2] = 0.0;
862 for (
int dy = 0; dy < D1D; ++dy)
865 for (
int qx = 0; qx < Q1D; ++qx)
870 for (
int dx = 0; dx < D1D; ++dx)
872 const real_t s = X(dx,dy,dz,e);
873 for (
int qx = 0; qx < Q1D; ++qx)
875 gradX[qx][0] += s * B(qx,dx);
876 gradX[qx][1] += s * G(qx,dx);
879 for (
int qy = 0; qy < Q1D; ++qy)
881 const real_t wy = B(qy,dy);
882 const real_t wDy = G(qy,dy);
883 for (
int qx = 0; qx < Q1D; ++qx)
885 const real_t wx = gradX[qx][0];
886 const real_t wDx = gradX[qx][1];
887 gradXY[qy][qx][0] += wDx * wy;
888 gradXY[qy][qx][1] += wx * wDy;
889 gradXY[qy][qx][2] += wx * wy;
893 for (
int qz = 0; qz < Q1D; ++qz)
895 const real_t wz = B(qz,dz);
896 const real_t wDz = G(qz,dz);
897 for (
int qy = 0; qy < Q1D; ++qy)
899 for (
int qx = 0; qx < Q1D; ++qx)
901 grad[qz][qy][qx][0] += gradXY[qy][qx][0] * wz;
902 grad[qz][qy][qx][1] += gradXY[qy][qx][1] * wz;
903 grad[qz][qy][qx][2] += gradXY[qy][qx][2] * wDz;
909 for (
int qz = 0; qz < Q1D; ++qz)
911 for (
int qy = 0; qy < Q1D; ++qy)
913 for (
int qx = 0; qx < Q1D; ++qx)
915 const int q = qx + (qy + qz * Q1D) * Q1D;
916 const real_t O11 = D(q,0,e);
917 const real_t O12 = D(q,1,e);
918 const real_t O13 = D(q,2,e);
919 const real_t O21 = symmetric ? O12 : D(q,3,e);
920 const real_t O22 = symmetric ? D(q,3,e) : D(q,4,e);
921 const real_t O23 = symmetric ? D(q,4,e) : D(q,5,e);
922 const real_t O31 = symmetric ? O13 : D(q,6,e);
923 const real_t O32 = symmetric ? O23 : D(q,7,e);
924 const real_t O33 = symmetric ? D(q,5,e) : D(q,8,e);
925 const real_t gradX = grad[qz][qy][qx][0];
926 const real_t gradY = grad[qz][qy][qx][1];
927 const real_t gradZ = grad[qz][qy][qx][2];
928 grad[qz][qy][qx][0] = (O11*gradX)+(O12*gradY)+(O13*gradZ);
929 grad[qz][qy][qx][1] = (O21*gradX)+(O22*gradY)+(O23*gradZ);
930 grad[qz][qy][qx][2] = (O31*gradX)+(O32*gradY)+(O33*gradZ);
934 for (
int qz = 0; qz < Q1D; ++qz)
936 real_t gradXY[max_D1D][max_D1D][3];
937 for (
int dy = 0; dy < D1D; ++dy)
939 for (
int dx = 0; dx < D1D; ++dx)
941 gradXY[dy][dx][0] = 0;
942 gradXY[dy][dx][1] = 0;
943 gradXY[dy][dx][2] = 0;
946 for (
int qy = 0; qy < Q1D; ++qy)
949 for (
int dx = 0; dx < D1D; ++dx)
955 for (
int qx = 0; qx < Q1D; ++qx)
957 const real_t gX = grad[qz][qy][qx][0];
958 const real_t gY = grad[qz][qy][qx][1];
959 const real_t gZ = grad[qz][qy][qx][2];
960 for (
int dx = 0; dx < D1D; ++dx)
962 const real_t wx = Bt(dx,qx);
963 const real_t wDx = Gt(dx,qx);
964 gradX[dx][0] += gX * wDx;
965 gradX[dx][1] += gY * wx;
966 gradX[dx][2] += gZ * wx;
969 for (
int dy = 0; dy < D1D; ++dy)
971 const real_t wy = Bt(dy,qy);
972 const real_t wDy = Gt(dy,qy);
973 for (
int dx = 0; dx < D1D; ++dx)
975 gradXY[dy][dx][0] += gradX[dx][0] * wy;
976 gradXY[dy][dx][1] += gradX[dx][1] * wDy;
977 gradXY[dy][dx][2] += gradX[dx][2] * wy;
981 for (
int dz = 0; dz < D1D; ++dz)
983 const real_t wz = Bt(dz,qz);
984 const real_t wDz = Gt(dz,qz);
985 for (
int dy = 0; dy < D1D; ++dy)
987 for (
int dx = 0; dx < D1D; ++dx)
990 ((gradXY[dy][dx][0] * wz) +
991 (gradXY[dy][dx][1] * wz) +
992 (gradXY[dy][dx][2] * wDz));
1001template<
int T_D1D = 0,
int T_Q1D = 0>
1002inline void SmemPADiffusionApply3D(
const int NE,
1003 const bool symmetric,
1004 const Array<real_t> &b_,
1005 const Array<real_t> &g_,
1006 const Array<real_t> &,
1007 const Array<real_t> &,
1014 const int D1D = T_D1D ? T_D1D : d1d;
1015 const int Q1D = T_Q1D ? T_Q1D : q1d;
1018 MFEM_VERIFY(D1D <= max_d1d,
"");
1019 MFEM_VERIFY(Q1D <= max_q1d,
"");
1020 auto b =
Reshape(b_.Read(), Q1D, D1D);
1021 auto g =
Reshape(g_.Read(), Q1D, D1D);
1022 auto d =
Reshape(d_.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
1023 auto x =
Reshape(x_.Read(), D1D, D1D, D1D, NE);
1024 auto y =
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
1027 const int D1D = T_D1D ? T_D1D : d1d;
1028 const int Q1D = T_Q1D ? T_Q1D : q1d;
1029 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
1030 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
1031 constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
1032 MFEM_SHARED
real_t sBG[2][MQ1*MD1];
1037 MFEM_SHARED
real_t sm0[3][MDQ*MDQ*MDQ];
1038 MFEM_SHARED
real_t sm1[3][MDQ*MDQ*MDQ];
1039 real_t (*X)[MD1][MD1] = (
real_t (*)[MD1][MD1]) (sm0+2);
1040 real_t (*DDQ0)[MD1][MQ1] = (
real_t (*)[MD1][MQ1]) (sm0+0);
1041 real_t (*DDQ1)[MD1][MQ1] = (
real_t (*)[MD1][MQ1]) (sm0+1);
1042 real_t (*DQQ0)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm1+0);
1043 real_t (*DQQ1)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm1+1);
1044 real_t (*DQQ2)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm1+2);
1045 real_t (*QQQ0)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm0+0);
1046 real_t (*QQQ1)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm0+1);
1047 real_t (*QQQ2)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm0+2);
1048 real_t (*QQD0)[MQ1][MD1] = (
real_t (*)[MQ1][MD1]) (sm1+0);
1049 real_t (*QQD1)[MQ1][MD1] = (
real_t (*)[MQ1][MD1]) (sm1+1);
1050 real_t (*QQD2)[MQ1][MD1] = (
real_t (*)[MQ1][MD1]) (sm1+2);
1051 real_t (*QDD0)[MD1][MD1] = (
real_t (*)[MD1][MD1]) (sm0+0);
1052 real_t (*QDD1)[MD1][MD1] = (
real_t (*)[MD1][MD1]) (sm0+1);
1053 real_t (*QDD2)[MD1][MD1] = (
real_t (*)[MD1][MD1]) (sm0+2);
1054 MFEM_FOREACH_THREAD(dz,z,D1D)
1056 MFEM_FOREACH_THREAD(dy,y,D1D)
1058 MFEM_FOREACH_THREAD(dx,x,D1D)
1060 X[dz][dy][dx] = x(dx,dy,dz,e);
1064 if (MFEM_THREAD_ID(z) == 0)
1066 MFEM_FOREACH_THREAD(dy,y,D1D)
1068 MFEM_FOREACH_THREAD(qx,x,Q1D)
1070 B[qx][dy] =
b(qx,dy);
1071 G[qx][dy] = g(qx,dy);
1076 MFEM_FOREACH_THREAD(dz,z,D1D)
1078 MFEM_FOREACH_THREAD(dy,y,D1D)
1080 MFEM_FOREACH_THREAD(qx,x,Q1D)
1084 for (
int dx = 0; dx < D1D; ++dx)
1086 const real_t coords = X[dz][dy][dx];
1087 u += coords * B[qx][dx];
1088 v += coords * G[qx][dx];
1090 DDQ0[dz][dy][qx] =
u;
1091 DDQ1[dz][dy][qx] = v;
1096 MFEM_FOREACH_THREAD(dz,z,D1D)
1098 MFEM_FOREACH_THREAD(qy,y,Q1D)
1100 MFEM_FOREACH_THREAD(qx,x,Q1D)
1102 real_t u = 0.0, v = 0.0, w = 0.0;
1104 for (
int dy = 0; dy < D1D; ++dy)
1106 u += DDQ1[dz][dy][qx] * B[qy][dy];
1107 v += DDQ0[dz][dy][qx] * G[qy][dy];
1108 w += DDQ0[dz][dy][qx] * B[qy][dy];
1110 DQQ0[dz][qy][qx] =
u;
1111 DQQ1[dz][qy][qx] = v;
1112 DQQ2[dz][qy][qx] = w;
1117 MFEM_FOREACH_THREAD(qz,z,Q1D)
1119 MFEM_FOREACH_THREAD(qy,y,Q1D)
1121 MFEM_FOREACH_THREAD(qx,x,Q1D)
1123 real_t u = 0.0, v = 0.0, w = 0.0;
1125 for (
int dz = 0; dz < D1D; ++dz)
1127 u += DQQ0[dz][qy][qx] * B[qz][dz];
1128 v += DQQ1[dz][qy][qx] * B[qz][dz];
1129 w += DQQ2[dz][qy][qx] * G[qz][dz];
1131 const real_t O11 = d(qx,qy,qz,0,e);
1132 const real_t O12 = d(qx,qy,qz,1,e);
1133 const real_t O13 = d(qx,qy,qz,2,e);
1134 const real_t O21 = symmetric ? O12 : d(qx,qy,qz,3,e);
1135 const real_t O22 = symmetric ? d(qx,qy,qz,3,e) : d(qx,qy,qz,4,e);
1136 const real_t O23 = symmetric ? d(qx,qy,qz,4,e) : d(qx,qy,qz,5,e);
1137 const real_t O31 = symmetric ? O13 : d(qx,qy,qz,6,e);
1138 const real_t O32 = symmetric ? O23 : d(qx,qy,qz,7,e);
1139 const real_t O33 = symmetric ? d(qx,qy,qz,5,e) : d(qx,qy,qz,8,e);
1143 QQQ0[qz][qy][qx] = (O11*gX) + (O12*gY) + (O13*gZ);
1144 QQQ1[qz][qy][qx] = (O21*gX) + (O22*gY) + (O23*gZ);
1145 QQQ2[qz][qy][qx] = (O31*gX) + (O32*gY) + (O33*gZ);
1150 if (MFEM_THREAD_ID(z) == 0)
1152 MFEM_FOREACH_THREAD(dy,y,D1D)
1154 MFEM_FOREACH_THREAD(qx,x,Q1D)
1156 Bt[dy][qx] =
b(qx,dy);
1157 Gt[dy][qx] = g(qx,dy);
1162 MFEM_FOREACH_THREAD(qz,z,Q1D)
1164 MFEM_FOREACH_THREAD(qy,y,Q1D)
1166 MFEM_FOREACH_THREAD(dx,x,D1D)
1168 real_t u = 0.0, v = 0.0, w = 0.0;
1170 for (
int qx = 0; qx < Q1D; ++qx)
1172 u += QQQ0[qz][qy][qx] * Gt[dx][qx];
1173 v += QQQ1[qz][qy][qx] * Bt[dx][qx];
1174 w += QQQ2[qz][qy][qx] * Bt[dx][qx];
1176 QQD0[qz][qy][dx] =
u;
1177 QQD1[qz][qy][dx] = v;
1178 QQD2[qz][qy][dx] = w;
1183 MFEM_FOREACH_THREAD(qz,z,Q1D)
1185 MFEM_FOREACH_THREAD(dy,y,D1D)
1187 MFEM_FOREACH_THREAD(dx,x,D1D)
1189 real_t u = 0.0, v = 0.0, w = 0.0;
1191 for (
int qy = 0; qy < Q1D; ++qy)
1193 u += QQD0[qz][qy][dx] * Bt[dy][qy];
1194 v += QQD1[qz][qy][dx] * Gt[dy][qy];
1195 w += QQD2[qz][qy][dx] * Bt[dy][qy];
1197 QDD0[qz][dy][dx] =
u;
1198 QDD1[qz][dy][dx] = v;
1199 QDD2[qz][dy][dx] = w;
1204 MFEM_FOREACH_THREAD(dz,z,D1D)
1206 MFEM_FOREACH_THREAD(dy,y,D1D)
1208 MFEM_FOREACH_THREAD(dx,x,D1D)
1210 real_t u = 0.0, v = 0.0, w = 0.0;
1212 for (
int qz = 0; qz < Q1D; ++qz)
1214 u += QDD0[qz][dy][dx] * Bt[dz][qz];
1215 v += QDD1[qz][dy][dx] * Bt[dz][qz];
1216 w += QDD2[qz][dy][dx] * Gt[dz][qz];
1218 y(dx,dy,dz,e) += (
u + v + w);
1233template<
int DIM,
int T_D1D,
int T_Q1D>
1234ApplyKernelType DiffusionIntegrator::ApplyPAKernels::Kernel()
1236 if (
DIM == 2) {
return internal::SmemPADiffusionApply2D<T_D1D,T_Q1D>; }
1237 else if (
DIM == 3) {
return internal::SmemPADiffusionApply3D<T_D1D, T_Q1D>; }
1238 else { MFEM_ABORT(
""); }
1242ApplyKernelType DiffusionIntegrator::ApplyPAKernels::Fallback(
int DIM,
int,
int)
1244 if (
DIM == 2) {
return internal::PADiffusionApply2D; }
1245 else if (
DIM == 3) {
return internal::PADiffusionApply3D; }
1246 else { MFEM_ABORT(
""); }
1249template<
int DIM,
int D1D,
int Q1D>
1250DiagonalKernelType DiffusionIntegrator::DiagonalPAKernels::Kernel()
1252 if (
DIM == 2) {
return internal::SmemPADiffusionDiagonal2D<D1D,Q1D>; }
1253 else if (
DIM == 3) {
return internal::SmemPADiffusionDiagonal3D<D1D, Q1D>; }
1254 else { MFEM_ABORT(
""); }
1257inline DiagonalKernelType
1258DiffusionIntegrator::DiagonalPAKernels::Fallback(
int DIM,
int,
int)
1260 if (
DIM == 2) {
return internal::PADiffusionDiagonal2D; }
1261 else if (
DIM == 3) {
return internal::PADiffusionDiagonal3D; }
1262 else { MFEM_ABORT(
""); }
void(*)(const int, const bool, const Array< real_t > &, const Array< real_t > &, const Vector &, Vector &, const int, const int) DiagonalKernelType
void(*)(const int, const bool, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Vector &, const Vector &, Vector &, const int, const int) ApplyKernelType
real_t u(const Vector &xvec)
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
void forall_3D(int N, int X, int Y, int Z, lambda &&body)
void forall(int N, lambda &&body)
real_t p(const Vector &x, real_t t)
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.
int MAX_D1D
Maximum number of 1D nodal points.
int MAX_Q1D
Maximum number of 1D quadrature points.