12#ifndef MFEM_BILININTEG_DIFFUSION_KERNELS_HPP
13#define MFEM_BILININTEG_DIFFUSION_KERNELS_HPP
30void PADiffusionSetup(
const int dim,
36 const Array<real_t> &W,
43void PADiffusionSetup2D(
const int Q1D,
46 const Array<real_t> &w,
52void PADiffusionSetup3D(
const int Q1D,
55 const Array<real_t> &w,
62void OccaPADiffusionSetup2D(
const int D1D,
65 const Array<real_t> &W,
71void OccaPADiffusionSetup3D(
const int D1D,
74 const Array<real_t> &W,
80void PADiffusionAssembleDiagonal(
const int dim,
85 const Array<real_t> &B,
86 const Array<real_t> &G,
91template<
int T_D1D = 0,
int T_Q1D = 0>
92inline void PADiffusionDiagonal2D(
const int NE,
94 const Array<real_t> &
b,
95 const Array<real_t> &g,
101 const int D1D = T_D1D ? T_D1D : d1d;
102 const int Q1D = T_Q1D ? T_Q1D : q1d;
105 auto B =
Reshape(
b.Read(), Q1D, D1D);
106 auto G =
Reshape(g.Read(), Q1D, D1D);
109 auto D =
Reshape(d.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
110 auto Y =
Reshape(y.ReadWrite(), D1D, D1D, NE);
113 const int D1D = T_D1D ? T_D1D : d1d;
114 const int Q1D = T_Q1D ? T_Q1D : q1d;
115 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
116 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
121 for (
int qx = 0; qx < Q1D; ++qx)
123 for (
int dy = 0; dy < D1D; ++dy)
128 for (
int qy = 0; qy < Q1D; ++qy)
130 const int q = qx + qy * Q1D;
131 const real_t D00 = D(q,0,e);
132 const real_t D10 = D(q,1,e);
133 const real_t D01 = symmetric ? D10 : D(q,2,e);
134 const real_t D11 = symmetric ? D(q,2,e) : D(q,3,e);
135 QD0[qx][dy] += B(qy, dy) * B(qy, dy) * D00;
136 QD1[qx][dy] += B(qy, dy) * G(qy, dy) * (D01 + D10);
137 QD2[qx][dy] += G(qy, dy) * G(qy, dy) * D11;
141 for (
int dy = 0; dy < D1D; ++dy)
143 for (
int dx = 0; dx < D1D; ++dx)
145 for (
int qx = 0; qx < Q1D; ++qx)
147 Y(dx,dy,e) += G(qx, dx) * G(qx, dx) * QD0[qx][dy];
148 Y(dx,dy,e) += G(qx, dx) * B(qx, dx) * QD1[qx][dy];
149 Y(dx,dy,e) += B(qx, dx) * B(qx, dx) * QD2[qx][dy];
158constexpr int ipow(
int x,
int p) {
return p == 0 ? 1 : x*ipow(x,
p-1); }
159constexpr int D11(
int x) {
return (11 - x)/2; }
160constexpr int D10(
int x) {
return (10 - x)/2; }
161constexpr int NBZApply(
int D1D)
163 return ipow(2, D11(D1D) >= 0 ? D11(D1D) : 0);
165constexpr int NBZDiagonal(
int D1D)
167 return ipow(2, D10(D1D) >= 0 ? D10(D1D) : 0);
172template<
int T_D1D = 0,
int T_Q1D = 0>
173inline void SmemPADiffusionDiagonal2D(
const int NE,
174 const bool symmetric,
175 const Array<real_t> &b_,
176 const Array<real_t> &g_,
182 static constexpr int T_NBZ = diffusion::NBZDiagonal(T_D1D);
183 static constexpr int NBZ = T_NBZ ? T_NBZ : 1;
184 const int D1D = T_D1D ? T_D1D : d1d;
185 const int Q1D = T_Q1D ? T_Q1D : q1d;
188 MFEM_VERIFY(D1D <= max_d1d,
"");
189 MFEM_VERIFY(Q1D <= max_q1d,
"");
190 auto b =
Reshape(b_.Read(), Q1D, D1D);
191 auto g =
Reshape(g_.Read(), Q1D, D1D);
192 auto D =
Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
193 auto Y =
Reshape(y_.ReadWrite(), D1D, D1D, NE);
196 const int tidz = MFEM_THREAD_ID(z);
197 const int D1D = T_D1D ? T_D1D : d1d;
198 const int Q1D = T_Q1D ? T_Q1D : q1d;
199 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
200 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
201 MFEM_SHARED
real_t BG[2][MQ1*MD1];
204 MFEM_SHARED
real_t QD[3][NBZ][MQ1][MD1];
210 MFEM_FOREACH_THREAD(d,y,D1D)
212 MFEM_FOREACH_THREAD(q,x,Q1D)
220 MFEM_FOREACH_THREAD(qx,x,Q1D)
222 MFEM_FOREACH_THREAD(dy,y,D1D)
227 for (
int qy = 0; qy < Q1D; ++qy)
229 const int q = qx + qy * Q1D;
230 const real_t D00 = D(q,0,e);
231 const real_t D10 = D(q,1,e);
232 const real_t D01 = symmetric ? D10 : D(q,2,e);
233 const real_t D11 = symmetric ? D(q,2,e) : D(q,3,e);
234 const real_t By = B[qy][dy];
235 const real_t Gy = G[qy][dy];
236 const real_t BBy = By * By;
237 const real_t BGy = By * Gy;
238 const real_t GGy = Gy * Gy;
239 QD0[qx][dy] += BBy * D00;
240 QD1[qx][dy] += BGy * (D01 + D10);
241 QD2[qx][dy] += GGy * D11;
246 MFEM_FOREACH_THREAD(dy,y,D1D)
248 MFEM_FOREACH_THREAD(dx,x,D1D)
250 for (
int qx = 0; qx < Q1D; ++qx)
252 const real_t Bx = B[qx][dx];
253 const real_t Gx = G[qx][dx];
254 const real_t BBx = Bx * Bx;
255 const real_t BGx = Bx * Gx;
256 const real_t GGx = Gx * Gx;
257 Y(dx,dy,e) += GGx * QD0[qx][dy];
258 Y(dx,dy,e) += BGx * QD1[qx][dy];
259 Y(dx,dy,e) += BBx * QD2[qx][dy];
267template<
int T_D1D = 0,
int T_Q1D = 0>
268inline void PADiffusionDiagonal3D(
const int NE,
269 const bool symmetric,
270 const Array<real_t> &
b,
271 const Array<real_t> &g,
277 constexpr int DIM = 3;
278 const int D1D = T_D1D ? T_D1D : d1d;
279 const int Q1D = T_Q1D ? T_Q1D : q1d;
282 MFEM_VERIFY(D1D <= max_d1d,
"");
283 MFEM_VERIFY(Q1D <= max_q1d,
"");
284 auto B =
Reshape(
b.Read(), Q1D, D1D);
285 auto G =
Reshape(g.Read(), Q1D, D1D);
286 auto Q =
Reshape(d.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
287 auto Y =
Reshape(y.ReadWrite(), D1D, D1D, D1D, NE);
290 const int D1D = T_D1D ? T_D1D : d1d;
291 const int Q1D = T_Q1D ? T_Q1D : q1d;
292 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
293 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
294 real_t QQD[MQ1][MQ1][MD1];
295 real_t QDD[MQ1][MD1][MD1];
296 for (
int i = 0; i <
DIM; ++i)
298 for (
int j = 0; j <
DIM; ++j)
301 for (
int qx = 0; qx < Q1D; ++qx)
303 for (
int qy = 0; qy < Q1D; ++qy)
305 for (
int dz = 0; dz < D1D; ++dz)
307 QQD[qx][qy][dz] = 0.0;
308 for (
int qz = 0; qz < Q1D; ++qz)
310 const int q = qx + (qy + qz * Q1D) * Q1D;
311 const int ksym = j >= i ?
312 3 - (3-i)*(2-i)/2 + j:
313 3 - (3-j)*(2-j)/2 + i;
314 const int k = symmetric ? ksym : (i*
DIM) + j;
315 const real_t O = Q(q,k,e);
316 const real_t Bz = B(qz,dz);
317 const real_t Gz = G(qz,dz);
318 const real_t L = i==2 ? Gz : Bz;
319 const real_t R = j==2 ? Gz : Bz;
320 QQD[qx][qy][dz] += L * O * R;
326 for (
int qx = 0; qx < Q1D; ++qx)
328 for (
int dz = 0; dz < D1D; ++dz)
330 for (
int dy = 0; dy < D1D; ++dy)
332 QDD[qx][dy][dz] = 0.0;
333 for (
int qy = 0; qy < Q1D; ++qy)
335 const real_t By = B(qy,dy);
336 const real_t Gy = G(qy,dy);
337 const real_t L = i==1 ? Gy : By;
338 const real_t R = j==1 ? Gy : By;
339 QDD[qx][dy][dz] += L * QQD[qx][qy][dz] * R;
345 for (
int dz = 0; dz < D1D; ++dz)
347 for (
int dy = 0; dy < D1D; ++dy)
349 for (
int dx = 0; dx < D1D; ++dx)
351 for (
int qx = 0; qx < Q1D; ++qx)
353 const real_t Bx = B(qx,dx);
354 const real_t Gx = G(qx,dx);
355 const real_t L = i==0 ? Gx : Bx;
356 const real_t R = j==0 ? Gx : Bx;
357 Y(dx, dy, dz, e) += L * QDD[qx][dy][dz] * R;
368template<
int T_D1D = 0,
int T_Q1D = 0>
369inline void SmemPADiffusionDiagonal3D(
const int NE,
370 const bool symmetric,
371 const Array<real_t> &b_,
372 const Array<real_t> &g_,
378 constexpr int DIM = 3;
379 const int D1D = T_D1D ? T_D1D : d1d;
380 const int Q1D = T_Q1D ? T_Q1D : q1d;
383 MFEM_VERIFY(D1D <= max_d1d,
"");
384 MFEM_VERIFY(Q1D <= max_q1d,
"");
385 auto b =
Reshape(b_.Read(), Q1D, D1D);
386 auto g =
Reshape(g_.Read(), Q1D, D1D);
387 auto D =
Reshape(d_.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
388 auto Y =
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
391 const int tidz = MFEM_THREAD_ID(z);
392 const int D1D = T_D1D ? T_D1D : d1d;
393 const int Q1D = T_Q1D ? T_Q1D : q1d;
394 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
395 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
396 MFEM_SHARED
real_t BG[2][MQ1*MD1];
399 MFEM_SHARED
real_t QQD[MQ1][MQ1][MD1];
400 MFEM_SHARED
real_t QDD[MQ1][MD1][MD1];
403 MFEM_FOREACH_THREAD(d,y,D1D)
405 MFEM_FOREACH_THREAD(q,x,Q1D)
413 for (
int i = 0; i <
DIM; ++i)
415 for (
int j = 0; j <
DIM; ++j)
418 MFEM_FOREACH_THREAD(qx,x,Q1D)
420 MFEM_FOREACH_THREAD(qy,y,Q1D)
422 MFEM_FOREACH_THREAD(dz,z,D1D)
424 QQD[qx][qy][dz] = 0.0;
425 for (
int qz = 0; qz < Q1D; ++qz)
427 const int q = qx + (qy + qz * Q1D) * Q1D;
428 const int ksym = j >= i ?
429 3 - (3-i)*(2-i)/2 + j:
430 3 - (3-j)*(2-j)/2 + i;
431 const int k = symmetric ? ksym : (i*
DIM) + j;
432 const real_t O = D(q,k,e);
433 const real_t Bz = B[qz][dz];
434 const real_t Gz = G[qz][dz];
435 const real_t L = i==2 ? Gz : Bz;
436 const real_t R = j==2 ? Gz : Bz;
437 QQD[qx][qy][dz] += L * O * R;
444 MFEM_FOREACH_THREAD(qx,x,Q1D)
446 MFEM_FOREACH_THREAD(dz,z,D1D)
448 MFEM_FOREACH_THREAD(dy,y,D1D)
450 QDD[qx][dy][dz] = 0.0;
451 for (
int qy = 0; qy < Q1D; ++qy)
453 const real_t By = B[qy][dy];
454 const real_t Gy = G[qy][dy];
455 const real_t L = i==1 ? Gy : By;
456 const real_t R = j==1 ? Gy : By;
457 QDD[qx][dy][dz] += L * QQD[qx][qy][dz] * R;
464 MFEM_FOREACH_THREAD(dz,z,D1D)
466 MFEM_FOREACH_THREAD(dy,y,D1D)
468 MFEM_FOREACH_THREAD(dx,x,D1D)
470 for (
int qx = 0; qx < Q1D; ++qx)
472 const real_t Bx = B[qx][dx];
473 const real_t Gx = G[qx][dx];
474 const real_t L = i==0 ? Gx : Bx;
475 const real_t R = j==0 ? Gx : Bx;
476 Y(dx, dy, dz, e) += L * QDD[qx][dy][dz] * R;
488void OccaPADiffusionApply2D(
const int D1D,
491 const Array<real_t> &B,
492 const Array<real_t> &G,
493 const Array<real_t> &Bt,
494 const Array<real_t> &Gt,
500void OccaPADiffusionApply3D(
const int D1D,
503 const Array<real_t> &B,
504 const Array<real_t> &G,
505 const Array<real_t> &Bt,
506 const Array<real_t> &Gt,
513template<
int T_D1D = 0,
int T_Q1D = 0>
514inline void PADiffusionApply2D(
const int NE,
515 const bool symmetric,
516 const Array<real_t> &b_,
517 const Array<real_t> &g_,
518 const Array<real_t> &bt_,
519 const Array<real_t> >_,
526 const int D1D = T_D1D ? T_D1D : d1d;
527 const int Q1D = T_Q1D ? T_Q1D : q1d;
530 auto B =
Reshape(b_.Read(), Q1D, D1D);
531 auto G =
Reshape(g_.Read(), Q1D, D1D);
532 auto Bt =
Reshape(bt_.Read(), D1D, Q1D);
533 auto Gt =
Reshape(gt_.Read(), D1D, Q1D);
534 auto D =
Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
535 auto X =
Reshape(x_.Read(), D1D, D1D, NE);
536 auto Y =
Reshape(y_.ReadWrite(), D1D, D1D, NE);
539 const int D1D = T_D1D ? T_D1D : d1d;
540 const int Q1D = T_Q1D ? T_Q1D : q1d;
542 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
543 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
545 real_t grad[max_Q1D][max_Q1D][2];
546 for (
int qy = 0; qy < Q1D; ++qy)
548 for (
int qx = 0; qx < Q1D; ++qx)
550 grad[qy][qx][0] = 0.0;
551 grad[qy][qx][1] = 0.0;
554 for (
int dy = 0; dy < D1D; ++dy)
557 for (
int qx = 0; qx < Q1D; ++qx)
562 for (
int dx = 0; dx < D1D; ++dx)
564 const real_t s = X(dx,dy,e);
565 for (
int qx = 0; qx < Q1D; ++qx)
567 gradX[qx][0] += s * B(qx,dx);
568 gradX[qx][1] += s * G(qx,dx);
571 for (
int qy = 0; qy < Q1D; ++qy)
573 const real_t wy = B(qy,dy);
574 const real_t wDy = G(qy,dy);
575 for (
int qx = 0; qx < Q1D; ++qx)
577 grad[qy][qx][0] += gradX[qx][1] * wy;
578 grad[qy][qx][1] += gradX[qx][0] * wDy;
583 for (
int qy = 0; qy < Q1D; ++qy)
585 for (
int qx = 0; qx < Q1D; ++qx)
587 const int q = qx + qy * Q1D;
589 const real_t O11 = D(q,0,e);
590 const real_t O21 = D(q,1,e);
591 const real_t O12 = symmetric ? O21 : D(q,2,e);
592 const real_t O22 = symmetric ? D(q,2,e) : D(q,3,e);
594 const real_t gradX = grad[qy][qx][0];
595 const real_t gradY = grad[qy][qx][1];
597 grad[qy][qx][0] = (O11 * gradX) + (O12 * gradY);
598 grad[qy][qx][1] = (O21 * gradX) + (O22 * gradY);
601 for (
int qy = 0; qy < Q1D; ++qy)
604 for (
int dx = 0; dx < D1D; ++dx)
609 for (
int qx = 0; qx < Q1D; ++qx)
611 const real_t gX = grad[qy][qx][0];
612 const real_t gY = grad[qy][qx][1];
613 for (
int dx = 0; dx < D1D; ++dx)
615 const real_t wx = Bt(dx,qx);
616 const real_t wDx = Gt(dx,qx);
617 gradX[dx][0] += gX * wDx;
618 gradX[dx][1] += gY * wx;
621 for (
int dy = 0; dy < D1D; ++dy)
623 const real_t wy = Bt(dy,qy);
624 const real_t wDy = Gt(dy,qy);
625 for (
int dx = 0; dx < D1D; ++dx)
627 Y(dx,dy,e) += ((gradX[dx][0] * wy) + (gradX[dx][1] * wDy));
635template<
int T_D1D = 0,
int T_Q1D = 0>
636inline void SmemPADiffusionApply2D(
const int NE,
637 const bool symmetric,
638 const Array<real_t> &b_,
639 const Array<real_t> &g_,
640 const Array<real_t> &bt_,
641 const Array<real_t> >_,
648 static constexpr int T_NBZ = diffusion::NBZApply(T_D1D);
649 static constexpr int NBZ = T_NBZ ? T_NBZ : 1;
650 const int D1D = T_D1D ? T_D1D : d1d;
651 const int Q1D = T_Q1D ? T_Q1D : q1d;
654 MFEM_VERIFY(D1D <= max_d1d,
"");
655 MFEM_VERIFY(Q1D <= max_q1d,
"");
656 auto b =
Reshape(b_.Read(), Q1D, D1D);
657 auto g =
Reshape(g_.Read(), Q1D, D1D);
658 auto D =
Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
659 auto x =
Reshape(x_.Read(), D1D, D1D, NE);
660 auto Y =
Reshape(y_.ReadWrite(), D1D, D1D, NE);
663 const int tidz = MFEM_THREAD_ID(z);
664 const int D1D = T_D1D ? T_D1D : d1d;
665 const int Q1D = T_Q1D ? T_Q1D : q1d;
666 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
667 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
668 MFEM_SHARED
real_t sBG[2][MQ1*MD1];
673 MFEM_SHARED
real_t Xz[NBZ][MD1][MD1];
674 MFEM_SHARED
real_t GD[2][NBZ][MD1][MQ1];
675 MFEM_SHARED
real_t GQ[2][NBZ][MQ1][MQ1];
681 MFEM_FOREACH_THREAD(dy,y,D1D)
683 MFEM_FOREACH_THREAD(dx,x,D1D)
685 X[dy][dx] = x(dx,dy,e);
690 MFEM_FOREACH_THREAD(dy,y,D1D)
692 MFEM_FOREACH_THREAD(q,x,Q1D)
700 MFEM_FOREACH_THREAD(dy,y,D1D)
702 MFEM_FOREACH_THREAD(qx,x,Q1D)
706 for (
int dx = 0; dx < D1D; ++dx)
708 const real_t coords = X[dy][dx];
709 u += B[qx][dx] * coords;
710 v += G[qx][dx] * coords;
717 MFEM_FOREACH_THREAD(qy,y,Q1D)
719 MFEM_FOREACH_THREAD(qx,x,Q1D)
723 for (
int dy = 0; dy < D1D; ++dy)
725 u += DQ1[dy][qx] * B[qy][dy];
726 v += DQ0[dy][qx] * G[qy][dy];
733 MFEM_FOREACH_THREAD(qy,y,Q1D)
735 MFEM_FOREACH_THREAD(qx,x,Q1D)
737 const int q = (qx + ((qy) * Q1D));
738 const real_t O11 = D(q,0,e);
739 const real_t O21 = D(q,1,e);
740 const real_t O12 = symmetric ? O21 : D(q,2,e);
741 const real_t O22 = symmetric ? D(q,2,e) : D(q,3,e);
742 const real_t gX = QQ0[qy][qx];
743 const real_t gY = QQ1[qy][qx];
744 QQ0[qy][qx] = (O11 * gX) + (O12 * gY);
745 QQ1[qy][qx] = (O21 * gX) + (O22 * gY);
751 MFEM_FOREACH_THREAD(dy,y,D1D)
753 MFEM_FOREACH_THREAD(q,x,Q1D)
761 MFEM_FOREACH_THREAD(qy,y,Q1D)
763 MFEM_FOREACH_THREAD(dx,x,D1D)
767 for (
int qx = 0; qx < Q1D; ++qx)
769 u += Gt[dx][qx] * QQ0[qy][qx];
770 v += Bt[dx][qx] * QQ1[qy][qx];
777 MFEM_FOREACH_THREAD(dy,y,D1D)
779 MFEM_FOREACH_THREAD(dx,x,D1D)
783 for (
int qy = 0; qy < Q1D; ++qy)
785 u += DQ0[dx][qy] * Bt[dy][qy];
786 v += DQ1[dx][qy] * Gt[dy][qy];
788 Y(dx,dy,e) += (
u + v);
795template<
int T_D1D = 0,
int T_Q1D = 0>
796inline void PADiffusionApply3D(
const int NE,
797 const bool symmetric,
798 const Array<real_t> &
b,
799 const Array<real_t> &g,
800 const Array<real_t> &bt,
801 const Array<real_t> >,
805 int d1d = 0,
int q1d = 0)
807 const int D1D = T_D1D ? T_D1D : d1d;
808 const int Q1D = T_Q1D ? T_Q1D : q1d;
811 auto B =
Reshape(
b.Read(), Q1D, D1D);
812 auto G =
Reshape(g.Read(), Q1D, D1D);
813 auto Bt =
Reshape(bt.Read(), D1D, Q1D);
814 auto Gt =
Reshape(gt.Read(), D1D, Q1D);
815 auto D =
Reshape(d_.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
816 auto X =
Reshape(x_.Read(), D1D, D1D, D1D, NE);
817 auto Y =
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
820 const int D1D = T_D1D ? T_D1D : d1d;
821 const int Q1D = T_Q1D ? T_Q1D : q1d;
822 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
823 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
824 real_t grad[max_Q1D][max_Q1D][max_Q1D][3];
825 for (
int qz = 0; qz < Q1D; ++qz)
827 for (
int qy = 0; qy < Q1D; ++qy)
829 for (
int qx = 0; qx < Q1D; ++qx)
831 grad[qz][qy][qx][0] = 0.0;
832 grad[qz][qy][qx][1] = 0.0;
833 grad[qz][qy][qx][2] = 0.0;
837 for (
int dz = 0; dz < D1D; ++dz)
839 real_t gradXY[max_Q1D][max_Q1D][3];
840 for (
int qy = 0; qy < Q1D; ++qy)
842 for (
int qx = 0; qx < Q1D; ++qx)
844 gradXY[qy][qx][0] = 0.0;
845 gradXY[qy][qx][1] = 0.0;
846 gradXY[qy][qx][2] = 0.0;
849 for (
int dy = 0; dy < D1D; ++dy)
852 for (
int qx = 0; qx < Q1D; ++qx)
857 for (
int dx = 0; dx < D1D; ++dx)
859 const real_t s = X(dx,dy,dz,e);
860 for (
int qx = 0; qx < Q1D; ++qx)
862 gradX[qx][0] += s * B(qx,dx);
863 gradX[qx][1] += s * G(qx,dx);
866 for (
int qy = 0; qy < Q1D; ++qy)
868 const real_t wy = B(qy,dy);
869 const real_t wDy = G(qy,dy);
870 for (
int qx = 0; qx < Q1D; ++qx)
872 const real_t wx = gradX[qx][0];
873 const real_t wDx = gradX[qx][1];
874 gradXY[qy][qx][0] += wDx * wy;
875 gradXY[qy][qx][1] += wx * wDy;
876 gradXY[qy][qx][2] += wx * wy;
880 for (
int qz = 0; qz < Q1D; ++qz)
882 const real_t wz = B(qz,dz);
883 const real_t wDz = G(qz,dz);
884 for (
int qy = 0; qy < Q1D; ++qy)
886 for (
int qx = 0; qx < Q1D; ++qx)
888 grad[qz][qy][qx][0] += gradXY[qy][qx][0] * wz;
889 grad[qz][qy][qx][1] += gradXY[qy][qx][1] * wz;
890 grad[qz][qy][qx][2] += gradXY[qy][qx][2] * wDz;
896 for (
int qz = 0; qz < Q1D; ++qz)
898 for (
int qy = 0; qy < Q1D; ++qy)
900 for (
int qx = 0; qx < Q1D; ++qx)
902 const int q = qx + (qy + qz * Q1D) * Q1D;
903 const real_t O11 = D(q,0,e);
904 const real_t O12 = D(q,1,e);
905 const real_t O13 = D(q,2,e);
906 const real_t O21 = symmetric ? O12 : D(q,3,e);
907 const real_t O22 = symmetric ? D(q,3,e) : D(q,4,e);
908 const real_t O23 = symmetric ? D(q,4,e) : D(q,5,e);
909 const real_t O31 = symmetric ? O13 : D(q,6,e);
910 const real_t O32 = symmetric ? O23 : D(q,7,e);
911 const real_t O33 = symmetric ? D(q,5,e) : D(q,8,e);
912 const real_t gradX = grad[qz][qy][qx][0];
913 const real_t gradY = grad[qz][qy][qx][1];
914 const real_t gradZ = grad[qz][qy][qx][2];
915 grad[qz][qy][qx][0] = (O11*gradX)+(O12*gradY)+(O13*gradZ);
916 grad[qz][qy][qx][1] = (O21*gradX)+(O22*gradY)+(O23*gradZ);
917 grad[qz][qy][qx][2] = (O31*gradX)+(O32*gradY)+(O33*gradZ);
921 for (
int qz = 0; qz < Q1D; ++qz)
923 real_t gradXY[max_D1D][max_D1D][3];
924 for (
int dy = 0; dy < D1D; ++dy)
926 for (
int dx = 0; dx < D1D; ++dx)
928 gradXY[dy][dx][0] = 0;
929 gradXY[dy][dx][1] = 0;
930 gradXY[dy][dx][2] = 0;
933 for (
int qy = 0; qy < Q1D; ++qy)
936 for (
int dx = 0; dx < D1D; ++dx)
942 for (
int qx = 0; qx < Q1D; ++qx)
944 const real_t gX = grad[qz][qy][qx][0];
945 const real_t gY = grad[qz][qy][qx][1];
946 const real_t gZ = grad[qz][qy][qx][2];
947 for (
int dx = 0; dx < D1D; ++dx)
949 const real_t wx = Bt(dx,qx);
950 const real_t wDx = Gt(dx,qx);
951 gradX[dx][0] += gX * wDx;
952 gradX[dx][1] += gY * wx;
953 gradX[dx][2] += gZ * wx;
956 for (
int dy = 0; dy < D1D; ++dy)
958 const real_t wy = Bt(dy,qy);
959 const real_t wDy = Gt(dy,qy);
960 for (
int dx = 0; dx < D1D; ++dx)
962 gradXY[dy][dx][0] += gradX[dx][0] * wy;
963 gradXY[dy][dx][1] += gradX[dx][1] * wDy;
964 gradXY[dy][dx][2] += gradX[dx][2] * wy;
968 for (
int dz = 0; dz < D1D; ++dz)
970 const real_t wz = Bt(dz,qz);
971 const real_t wDz = Gt(dz,qz);
972 for (
int dy = 0; dy < D1D; ++dy)
974 for (
int dx = 0; dx < D1D; ++dx)
977 ((gradXY[dy][dx][0] * wz) +
978 (gradXY[dy][dx][1] * wz) +
979 (gradXY[dy][dx][2] * wDz));
988template<
int T_D1D = 0,
int T_Q1D = 0>
989inline void SmemPADiffusionApply3D(
const int NE,
990 const bool symmetric,
991 const Array<real_t> &b_,
992 const Array<real_t> &g_,
993 const Array<real_t> &,
994 const Array<real_t> &,
1001 const int D1D = T_D1D ? T_D1D : d1d;
1002 const int Q1D = T_Q1D ? T_Q1D : q1d;
1005 MFEM_VERIFY(D1D <= max_d1d,
"");
1006 MFEM_VERIFY(Q1D <= max_q1d,
"");
1007 auto b =
Reshape(b_.Read(), Q1D, D1D);
1008 auto g =
Reshape(g_.Read(), Q1D, D1D);
1009 auto d =
Reshape(d_.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
1010 auto x =
Reshape(x_.Read(), D1D, D1D, D1D, NE);
1011 auto y =
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
1012 MFEM_VERIFY(D1D <= Q1D,
"THREAD_DIRECT requires D1D <= Q1D");
1015 const int D1D = T_D1D ? T_D1D : d1d;
1016 const int Q1D = T_Q1D ? T_Q1D : q1d;
1017 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
1018 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
1019 constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
1020 MFEM_SHARED
real_t sBG[2][MQ1*MD1];
1025 MFEM_SHARED
real_t sm0[3][MDQ*MDQ*MDQ];
1026 MFEM_SHARED
real_t sm1[3][MDQ*MDQ*MDQ];
1027 real_t (*X)[MD1][MD1] = (
real_t (*)[MD1][MD1]) (sm0+2);
1028 real_t (*DDQ0)[MD1][MQ1] = (
real_t (*)[MD1][MQ1]) (sm0+0);
1029 real_t (*DDQ1)[MD1][MQ1] = (
real_t (*)[MD1][MQ1]) (sm0+1);
1030 real_t (*DQQ0)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm1+0);
1031 real_t (*DQQ1)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm1+1);
1032 real_t (*DQQ2)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm1+2);
1033 real_t (*QQQ0)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm0+0);
1034 real_t (*QQQ1)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm0+1);
1035 real_t (*QQQ2)[MQ1][MQ1] = (
real_t (*)[MQ1][MQ1]) (sm0+2);
1036 real_t (*QQD0)[MQ1][MD1] = (
real_t (*)[MQ1][MD1]) (sm1+0);
1037 real_t (*QQD1)[MQ1][MD1] = (
real_t (*)[MQ1][MD1]) (sm1+1);
1038 real_t (*QQD2)[MQ1][MD1] = (
real_t (*)[MQ1][MD1]) (sm1+2);
1039 real_t (*QDD0)[MD1][MD1] = (
real_t (*)[MD1][MD1]) (sm0+0);
1040 real_t (*QDD1)[MD1][MD1] = (
real_t (*)[MD1][MD1]) (sm0+1);
1041 real_t (*QDD2)[MD1][MD1] = (
real_t (*)[MD1][MD1]) (sm0+2);
1042 MFEM_FOREACH_THREAD_DIRECT(dz,z,D1D)
1044 MFEM_FOREACH_THREAD_DIRECT(dy,y,D1D)
1046 MFEM_FOREACH_THREAD_DIRECT(dx,x,D1D)
1048 X[dz][dy][dx] = x(dx,dy,dz,e);
1052 if (MFEM_THREAD_ID(z) == 0)
1054 MFEM_FOREACH_THREAD_DIRECT(dy,y,D1D)
1056 MFEM_FOREACH_THREAD_DIRECT(qx,x,Q1D)
1058 B[qx][dy] =
b(qx,dy);
1059 G[qx][dy] = g(qx,dy);
1064 MFEM_FOREACH_THREAD_DIRECT(dz,z,D1D)
1066 MFEM_FOREACH_THREAD_DIRECT(dy,y,D1D)
1068 MFEM_FOREACH_THREAD_DIRECT(qx,x,Q1D)
1072 for (
int dx = 0; dx < D1D; ++dx)
1074 const real_t coords = X[dz][dy][dx];
1075 u += coords * B[qx][dx];
1076 v += coords * G[qx][dx];
1078 DDQ0[dz][dy][qx] =
u;
1079 DDQ1[dz][dy][qx] = v;
1084 MFEM_FOREACH_THREAD_DIRECT(dz,z,D1D)
1086 MFEM_FOREACH_THREAD_DIRECT(qy,y,Q1D)
1088 MFEM_FOREACH_THREAD_DIRECT(qx,x,Q1D)
1090 real_t u = 0.0, v = 0.0, w = 0.0;
1092 for (
int dy = 0; dy < D1D; ++dy)
1094 u += DDQ1[dz][dy][qx] * B[qy][dy];
1095 v += DDQ0[dz][dy][qx] * G[qy][dy];
1096 w += DDQ0[dz][dy][qx] * B[qy][dy];
1098 DQQ0[dz][qy][qx] =
u;
1099 DQQ1[dz][qy][qx] = v;
1100 DQQ2[dz][qy][qx] = w;
1105 MFEM_FOREACH_THREAD_DIRECT(qz,z,Q1D)
1107 MFEM_FOREACH_THREAD_DIRECT(qy,y,Q1D)
1109 MFEM_FOREACH_THREAD_DIRECT(qx,x,Q1D)
1111 real_t u = 0.0, v = 0.0, w = 0.0;
1113 for (
int dz = 0; dz < D1D; ++dz)
1115 u += DQQ0[dz][qy][qx] * B[qz][dz];
1116 v += DQQ1[dz][qy][qx] * B[qz][dz];
1117 w += DQQ2[dz][qy][qx] * G[qz][dz];
1119 const real_t O11 = d(qx,qy,qz,0,e);
1120 const real_t O12 = d(qx,qy,qz,1,e);
1121 const real_t O13 = d(qx,qy,qz,2,e);
1122 const real_t O21 = symmetric ? O12 : d(qx,qy,qz,3,e);
1123 const real_t O22 = symmetric ? d(qx,qy,qz,3,e) : d(qx,qy,qz,4,e);
1124 const real_t O23 = symmetric ? d(qx,qy,qz,4,e) : d(qx,qy,qz,5,e);
1125 const real_t O31 = symmetric ? O13 : d(qx,qy,qz,6,e);
1126 const real_t O32 = symmetric ? O23 : d(qx,qy,qz,7,e);
1127 const real_t O33 = symmetric ? d(qx,qy,qz,5,e) : d(qx,qy,qz,8,e);
1131 QQQ0[qz][qy][qx] = (O11*gX) + (O12*gY) + (O13*gZ);
1132 QQQ1[qz][qy][qx] = (O21*gX) + (O22*gY) + (O23*gZ);
1133 QQQ2[qz][qy][qx] = (O31*gX) + (O32*gY) + (O33*gZ);
1138 if (MFEM_THREAD_ID(z) == 0)
1140 MFEM_FOREACH_THREAD_DIRECT(dy,y,D1D)
1142 MFEM_FOREACH_THREAD_DIRECT(qx,x,Q1D)
1144 Bt[dy][qx] =
b(qx,dy);
1145 Gt[dy][qx] = g(qx,dy);
1150 MFEM_FOREACH_THREAD_DIRECT(qz,z,Q1D)
1152 MFEM_FOREACH_THREAD_DIRECT(qy,y,Q1D)
1154 MFEM_FOREACH_THREAD_DIRECT(dx,x,D1D)
1156 real_t u = 0.0, v = 0.0, w = 0.0;
1158 for (
int qx = 0; qx < Q1D; ++qx)
1160 u += QQQ0[qz][qy][qx] * Gt[dx][qx];
1161 v += QQQ1[qz][qy][qx] * Bt[dx][qx];
1162 w += QQQ2[qz][qy][qx] * Bt[dx][qx];
1164 QQD0[qz][qy][dx] =
u;
1165 QQD1[qz][qy][dx] = v;
1166 QQD2[qz][qy][dx] = w;
1171 MFEM_FOREACH_THREAD_DIRECT(qz,z,Q1D)
1173 MFEM_FOREACH_THREAD_DIRECT(dy,y,D1D)
1175 MFEM_FOREACH_THREAD_DIRECT(dx,x,D1D)
1177 real_t u = 0.0, v = 0.0, w = 0.0;
1179 for (
int qy = 0; qy < Q1D; ++qy)
1181 u += QQD0[qz][qy][dx] * Bt[dy][qy];
1182 v += QQD1[qz][qy][dx] * Gt[dy][qy];
1183 w += QQD2[qz][qy][dx] * Bt[dy][qy];
1185 QDD0[qz][dy][dx] =
u;
1186 QDD1[qz][dy][dx] = v;
1187 QDD2[qz][dy][dx] = w;
1192 MFEM_FOREACH_THREAD_DIRECT(dz,z,D1D)
1194 MFEM_FOREACH_THREAD_DIRECT(dy,y,D1D)
1196 MFEM_FOREACH_THREAD_DIRECT(dx,x,D1D)
1198 real_t u = 0.0, v = 0.0, w = 0.0;
1200 for (
int qz = 0; qz < Q1D; ++qz)
1202 u += QDD0[qz][dy][dx] * Bt[dz][qz];
1203 v += QDD1[qz][dy][dx] * Bt[dz][qz];
1204 w += QDD2[qz][dy][dx] * Gt[dz][qz];
1206 y(dx,dy,dz,e) += (
u + v + w);
1221template<
int DIM,
int T_D1D,
int T_Q1D>
1222ApplyKernelType DiffusionIntegrator::ApplyPAKernels::Kernel()
1224 if constexpr (
DIM == 2) {
return internal::SmemPADiffusionApply2D<T_D1D,T_Q1D>; }
1225 else if constexpr (
DIM == 3) {
return internal::SmemPADiffusionApply3D<T_D1D, T_Q1D>; }
1230ApplyKernelType DiffusionIntegrator::ApplyPAKernels::Fallback(
int DIM,
int,
int)
1232 if (
DIM == 2) {
return internal::PADiffusionApply2D; }
1233 else if (
DIM == 3) {
return internal::PADiffusionApply3D; }
1234 else { MFEM_ABORT(
""); }
1237template<
int DIM,
int D1D,
int Q1D>
1238DiagonalKernelType DiffusionIntegrator::DiagonalPAKernels::Kernel()
1240 if constexpr (
DIM == 2) {
return internal::SmemPADiffusionDiagonal2D<D1D,Q1D>; }
1241 else if constexpr (
DIM == 3) {
return internal::SmemPADiffusionDiagonal3D<D1D, Q1D>; }
1245inline DiagonalKernelType
1246DiffusionIntegrator::DiagonalPAKernels::Fallback(
int DIM,
int,
int)
1248 if (
DIM == 2) {
return internal::PADiffusionDiagonal2D; }
1249 else if (
DIM == 3) {
return internal::PADiffusionDiagonal3D; }
1250 else { MFEM_ABORT(
""); }
void(*)(const int, const bool, const Array< real_t > &, const Array< real_t > &, const Vector &, Vector &, const int, const int) DiagonalKernelType
void(*)(const int, const bool, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Vector &, const Vector &, Vector &, const int, const int) ApplyKernelType
real_t u(const Vector &xvec)
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
void forall_3D(int N, int X, int Y, int Z, lambda &&body)
void forall(int N, lambda &&body)
real_t p(const Vector &x, real_t t)
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.
int MAX_D1D
Maximum number of 1D nodal points.
int MAX_Q1D
Maximum number of 1D quadrature points.