12#ifndef MFEM_BILININTEG_CONVECTION_KERNELS_HPP
13#define MFEM_BILININTEG_CONVECTION_KERNELS_HPP
26template<
int T_D1D = 0,
int T_Q1D = 0>
static
27void PAConvectionApply2D(
const int ne,
28 const Array<real_t> &
b,
29 const Array<real_t> &g,
30 const Array<real_t> &bt,
31 const Array<real_t> >,
39 const int D1D = T_D1D ? T_D1D : d1d;
40 const int Q1D = T_Q1D ? T_Q1D : q1d;
43 auto B =
Reshape(
b.Read(), Q1D, D1D);
44 auto G =
Reshape(g.Read(), Q1D, D1D);
45 auto Bt =
Reshape(bt.Read(), D1D, Q1D);
46 auto op =
Reshape(op_.Read(), Q1D, Q1D, 2, NE);
47 auto x =
Reshape(x_.Read(), D1D, D1D, NE);
48 auto y =
Reshape(y_.ReadWrite(), D1D, D1D, NE);
51 const int D1D = T_D1D ? T_D1D : d1d;
52 const int Q1D = T_Q1D ? T_Q1D : q1d;
54 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
55 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
58 for (
int dy = 0; dy < D1D; ++dy)
60 for (
int dx = 0; dx < D1D; ++dx)
62 u[dy][dx] = x(dx,dy,e);
65 real_t Bu[max_D1D][max_Q1D];
66 real_t Gu[max_D1D][max_Q1D];
67 for (
int dy = 0; dy < D1D; ++dy)
69 for (
int qx = 0; qx < Q1D; ++qx)
73 for (
int dx = 0; dx < D1D; ++dx)
75 const real_t bx = B(qx,dx);
76 const real_t gx = G(qx,dx);
83 real_t GBu[max_Q1D][max_Q1D];
84 real_t BGu[max_Q1D][max_Q1D];
85 for (
int qx = 0; qx < Q1D; ++qx)
87 for (
int qy = 0; qy < Q1D; ++qy)
91 for (
int dy = 0; dy < D1D; ++dy)
93 const real_t bx = B(qy,dy);
94 const real_t gx = G(qy,dy);
95 GBu[qy][qx] += gx * Bu[dy][qx];
96 BGu[qy][qx] += bx * Gu[dy][qx];
101 real_t DGu[max_Q1D][max_Q1D];
102 for (
int qy = 0; qy < Q1D; ++qy)
104 for (
int qx = 0; qx < Q1D; ++qx)
106 const real_t O1 = op(qx,qy,0,e);
107 const real_t O2 = op(qx,qy,1,e);
109 const real_t gradX = BGu[qy][qx];
110 const real_t gradY = GBu[qy][qx];
112 DGu[qy][qx] = (O1 * gradX) + (O2 * gradY);
115 real_t BDGu[max_D1D][max_Q1D];
116 for (
int qx = 0; qx < Q1D; ++qx)
118 for (
int dy = 0; dy < D1D; ++dy)
121 for (
int qy = 0; qy < Q1D; ++qy)
123 const real_t w = Bt(dy,qy);
124 BDGu[dy][qx] += w * DGu[qy][qx];
128 for (
int dx = 0; dx < D1D; ++dx)
130 for (
int dy = 0; dy < D1D; ++dy)
133 for (
int qx = 0; qx < Q1D; ++qx)
135 const real_t w = Bt(dx,qx);
136 BBDGu += w * BDGu[dy][qx];
145template<
int T_D1D = 0,
int T_Q1D = 0,
int T_NBZ = 0>
static
146void SmemPAConvectionApply2D(
const int ne,
147 const Array<real_t> &
b,
148 const Array<real_t> &g,
149 const Array<real_t> &bt,
150 const Array<real_t> >,
158 const int D1D = T_D1D ? T_D1D : d1d;
159 const int Q1D = T_Q1D ? T_Q1D : q1d;
160 constexpr int NBZ = T_NBZ ? T_NBZ : 1;
163 auto B =
Reshape(
b.Read(), Q1D, D1D);
164 auto G =
Reshape(g.Read(), Q1D, D1D);
165 auto Bt =
Reshape(bt.Read(), D1D, Q1D);
166 auto op =
Reshape(op_.Read(), Q1D, Q1D, 2, NE);
167 auto x =
Reshape(x_.Read(), D1D, D1D, NE);
168 auto y =
Reshape(y_.ReadWrite(), D1D, D1D, NE);
171 const int tidz = MFEM_THREAD_ID(z);
172 const int D1D = T_D1D ? T_D1D : d1d;
173 const int Q1D = T_Q1D ? T_Q1D : q1d;
175 constexpr int NBZ = T_NBZ ? T_NBZ : 1;
176 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
177 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
179 MFEM_SHARED
real_t u[NBZ][max_D1D][max_D1D];
180 MFEM_FOREACH_THREAD(dy,y,D1D)
182 MFEM_FOREACH_THREAD(dx,x,D1D)
185 u[tidz][dy][dx] = x(dx,dy,e);
189 MFEM_SHARED
real_t Bu[NBZ][max_D1D][max_Q1D];
190 MFEM_SHARED
real_t Gu[NBZ][max_D1D][max_Q1D];
191 MFEM_FOREACH_THREAD(dy,y,D1D)
193 MFEM_FOREACH_THREAD(qx,x,Q1D)
195 Bu[tidz][dy][qx] = 0.0;
196 Gu[tidz][dy][qx] = 0.0;
197 for (
int dx = 0; dx < D1D; ++dx)
199 const real_t bx = B(qx,dx);
200 const real_t gx = G(qx,dx);
201 const real_t x =
u[tidz][dy][dx];
202 Bu[tidz][dy][qx] += bx * x;
203 Gu[tidz][dy][qx] += gx * x;
208 MFEM_SHARED
real_t GBu[NBZ][max_Q1D][max_Q1D];
209 MFEM_SHARED
real_t BGu[NBZ][max_Q1D][max_Q1D];
210 MFEM_FOREACH_THREAD(qx,x,Q1D)
212 MFEM_FOREACH_THREAD(qy,y,Q1D)
214 GBu[tidz][qy][qx] = 0.0;
215 BGu[tidz][qy][qx] = 0.0;
216 for (
int dy = 0; dy < D1D; ++dy)
218 const real_t bx = B(qy,dy);
219 const real_t gx = G(qy,dy);
220 GBu[tidz][qy][qx] += gx * Bu[tidz][dy][qx];
221 BGu[tidz][qy][qx] += bx * Gu[tidz][dy][qx];
227 MFEM_SHARED
real_t DGu[NBZ][max_Q1D][max_Q1D];
228 MFEM_FOREACH_THREAD(qy,y,Q1D)
230 MFEM_FOREACH_THREAD(qx,x,Q1D)
232 const real_t O1 = op(qx,qy,0,e);
233 const real_t O2 = op(qx,qy,1,e);
235 const real_t gradX = BGu[tidz][qy][qx];
236 const real_t gradY = GBu[tidz][qy][qx];
238 DGu[tidz][qy][qx] = (O1 * gradX) + (O2 * gradY);
242 MFEM_SHARED
real_t BDGu[NBZ][max_D1D][max_Q1D];
243 MFEM_FOREACH_THREAD(qx,x,Q1D)
245 MFEM_FOREACH_THREAD(dy,y,D1D)
247 BDGu[tidz][dy][qx] = 0.0;
248 for (
int qy = 0; qy < Q1D; ++qy)
250 const real_t w = Bt(dy,qy);
251 BDGu[tidz][dy][qx] += w * DGu[tidz][qy][qx];
256 MFEM_FOREACH_THREAD(dx,x,D1D)
258 MFEM_FOREACH_THREAD(dy,y,D1D)
261 for (
int qx = 0; qx < Q1D; ++qx)
263 const real_t w = Bt(dx,qx);
264 BBDGu += w * BDGu[tidz][dy][qx];
273template<
int T_D1D = 0,
int T_Q1D = 0>
static
274void PAConvectionApply3D(
const int ne,
275 const Array<real_t> &
b,
276 const Array<real_t> &g,
277 const Array<real_t> &bt,
278 const Array<real_t> >,
286 const int D1D = T_D1D ? T_D1D : d1d;
287 const int Q1D = T_Q1D ? T_Q1D : q1d;
290 auto B =
Reshape(
b.Read(), Q1D, D1D);
291 auto G =
Reshape(g.Read(), Q1D, D1D);
292 auto Bt =
Reshape(bt.Read(), D1D, Q1D);
293 auto op =
Reshape(op_.Read(), Q1D, Q1D, Q1D, 3, NE);
294 auto x =
Reshape(x_.Read(), D1D, D1D, D1D, NE);
295 auto y =
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
298 const int D1D = T_D1D ? T_D1D : d1d;
299 const int Q1D = T_Q1D ? T_Q1D : q1d;
301 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
302 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
304 real_t u[max_D1D][max_D1D][max_D1D];
305 for (
int dz = 0; dz < D1D; ++dz)
307 for (
int dy = 0; dy < D1D; ++dy)
309 for (
int dx = 0; dx < D1D; ++dx)
311 u[dz][dy][dx] = x(dx,dy,dz,e);
315 real_t Bu[max_D1D][max_D1D][max_Q1D];
316 real_t Gu[max_D1D][max_D1D][max_Q1D];
317 for (
int dz = 0; dz < D1D; ++dz)
319 for (
int dy = 0; dy < D1D; ++dy)
321 for (
int qx = 0; qx < Q1D; ++qx)
323 Bu[dz][dy][qx] = 0.0;
324 Gu[dz][dy][qx] = 0.0;
325 for (
int dx = 0; dx < D1D; ++dx)
327 const real_t bx = B(qx,dx);
328 const real_t gx = G(qx,dx);
329 const real_t x =
u[dz][dy][dx];
330 Bu[dz][dy][qx] += bx * x;
331 Gu[dz][dy][qx] += gx * x;
336 real_t BBu[max_D1D][max_Q1D][max_Q1D];
337 real_t GBu[max_D1D][max_Q1D][max_Q1D];
338 real_t BGu[max_D1D][max_Q1D][max_Q1D];
339 for (
int dz = 0; dz < D1D; ++dz)
341 for (
int qx = 0; qx < Q1D; ++qx)
343 for (
int qy = 0; qy < Q1D; ++qy)
345 BBu[dz][qy][qx] = 0.0;
346 GBu[dz][qy][qx] = 0.0;
347 BGu[dz][qy][qx] = 0.0;
348 for (
int dy = 0; dy < D1D; ++dy)
350 const real_t bx = B(qy,dy);
351 const real_t gx = G(qy,dy);
352 BBu[dz][qy][qx] += bx * Bu[dz][dy][qx];
353 GBu[dz][qy][qx] += gx * Bu[dz][dy][qx];
354 BGu[dz][qy][qx] += bx * Gu[dz][dy][qx];
359 real_t GBBu[max_Q1D][max_Q1D][max_Q1D];
360 real_t BGBu[max_Q1D][max_Q1D][max_Q1D];
361 real_t BBGu[max_Q1D][max_Q1D][max_Q1D];
362 for (
int qx = 0; qx < Q1D; ++qx)
364 for (
int qy = 0; qy < Q1D; ++qy)
366 for (
int qz = 0; qz < Q1D; ++qz)
368 GBBu[qz][qy][qx] = 0.0;
369 BGBu[qz][qy][qx] = 0.0;
370 BBGu[qz][qy][qx] = 0.0;
371 for (
int dz = 0; dz < D1D; ++dz)
373 const real_t bx = B(qz,dz);
374 const real_t gx = G(qz,dz);
375 GBBu[qz][qy][qx] += gx * BBu[dz][qy][qx];
376 BGBu[qz][qy][qx] += bx * GBu[dz][qy][qx];
377 BBGu[qz][qy][qx] += bx * BGu[dz][qy][qx];
383 real_t DGu[max_Q1D][max_Q1D][max_Q1D];
384 for (
int qz = 0; qz < Q1D; ++qz)
386 for (
int qy = 0; qy < Q1D; ++qy)
388 for (
int qx = 0; qx < Q1D; ++qx)
390 const real_t O1 = op(qx,qy,qz,0,e);
391 const real_t O2 = op(qx,qy,qz,1,e);
392 const real_t O3 = op(qx,qy,qz,2,e);
394 const real_t gradX = BBGu[qz][qy][qx];
395 const real_t gradY = BGBu[qz][qy][qx];
396 const real_t gradZ = GBBu[qz][qy][qx];
398 DGu[qz][qy][qx] = (O1 * gradX) + (O2 * gradY) + (O3 * gradZ);
402 real_t BDGu[max_D1D][max_Q1D][max_Q1D];
403 for (
int qx = 0; qx < Q1D; ++qx)
405 for (
int qy = 0; qy < Q1D; ++qy)
407 for (
int dz = 0; dz < D1D; ++dz)
409 BDGu[dz][qy][qx] = 0.0;
410 for (
int qz = 0; qz < Q1D; ++qz)
412 const real_t w = Bt(dz,qz);
413 BDGu[dz][qy][qx] += w * DGu[qz][qy][qx];
418 real_t BBDGu[max_D1D][max_D1D][max_Q1D];
419 for (
int dz = 0; dz < D1D; ++dz)
421 for (
int qx = 0; qx < Q1D; ++qx)
423 for (
int dy = 0; dy < D1D; ++dy)
425 BBDGu[dz][dy][qx] = 0.0;
426 for (
int qy = 0; qy < Q1D; ++qy)
428 const real_t w = Bt(dy,qy);
429 BBDGu[dz][dy][qx] += w * BDGu[dz][qy][qx];
434 for (
int dz = 0; dz < D1D; ++dz)
436 for (
int dy = 0; dy < D1D; ++dy)
438 for (
int dx = 0; dx < D1D; ++dx)
441 for (
int qx = 0; qx < Q1D; ++qx)
443 const real_t w = Bt(dx,qx);
444 BBBDGu += w * BBDGu[dz][dy][qx];
446 y(dx,dy,dz,e) += BBBDGu;
454template<
int T_D1D = 0,
int T_Q1D = 0>
static
455void SmemPAConvectionApply3D(
const int ne,
456 const Array<real_t> &
b,
457 const Array<real_t> &g,
458 const Array<real_t> &bt,
459 const Array<real_t> >,
467 const int D1D = T_D1D ? T_D1D : d1d;
468 const int Q1D = T_Q1D ? T_Q1D : q1d;
471 auto B =
Reshape(
b.Read(), Q1D, D1D);
472 auto G =
Reshape(g.Read(), Q1D, D1D);
473 auto Bt =
Reshape(bt.Read(), D1D, Q1D);
474 auto op =
Reshape(op_.Read(), Q1D, Q1D, Q1D, 3, NE);
475 auto x =
Reshape(x_.Read(), D1D, D1D, D1D, NE);
476 auto y =
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
479 const int D1D = T_D1D ? T_D1D : d1d;
480 const int Q1D = T_Q1D ? T_Q1D : q1d;
482 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
483 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
484 constexpr int max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D;
485 MFEM_SHARED
real_t sm0[max_DQ*max_DQ*max_DQ];
486 MFEM_SHARED
real_t sm1[max_DQ*max_DQ*max_DQ];
487 MFEM_SHARED
real_t sm2[max_DQ*max_DQ*max_DQ];
488 MFEM_SHARED
real_t sm3[max_DQ*max_DQ*max_DQ];
489 MFEM_SHARED
real_t sm4[max_DQ*max_DQ*max_DQ];
490 MFEM_SHARED
real_t sm5[max_DQ*max_DQ*max_DQ];
492 real_t (*
u)[max_D1D][max_D1D] = (
real_t (*)[max_D1D][max_D1D]) sm0;
493 MFEM_FOREACH_THREAD(dz,z,D1D)
495 MFEM_FOREACH_THREAD(dy,y,D1D)
497 MFEM_FOREACH_THREAD(dx,x,D1D)
499 u[dz][dy][dx] = x(dx,dy,dz,e);
504 real_t (*Bu)[max_D1D][max_Q1D] = (
real_t (*)[max_D1D][max_Q1D])sm1;
505 real_t (*Gu)[max_D1D][max_Q1D] = (
real_t (*)[max_D1D][max_Q1D])sm2;
506 MFEM_FOREACH_THREAD(dz,z,D1D)
508 MFEM_FOREACH_THREAD(dy,y,D1D)
510 MFEM_FOREACH_THREAD(qx,x,Q1D)
514 for (
int dx = 0; dx < D1D; ++dx)
516 const real_t bx = B(qx,dx);
517 const real_t gx = G(qx,dx);
518 const real_t x =
u[dz][dy][dx];
522 Bu[dz][dy][qx] = Bu_;
523 Gu[dz][dy][qx] = Gu_;
528 real_t (*BBu)[max_Q1D][max_Q1D] = (
real_t (*)[max_Q1D][max_Q1D])sm3;
529 real_t (*GBu)[max_Q1D][max_Q1D] = (
real_t (*)[max_Q1D][max_Q1D])sm4;
530 real_t (*BGu)[max_Q1D][max_Q1D] = (
real_t (*)[max_Q1D][max_Q1D])sm5;
531 MFEM_FOREACH_THREAD(dz,z,D1D)
533 MFEM_FOREACH_THREAD(qx,x,Q1D)
535 MFEM_FOREACH_THREAD(qy,y,Q1D)
540 for (
int dy = 0; dy < D1D; ++dy)
542 const real_t bx = B(qy,dy);
543 const real_t gx = G(qy,dy);
544 BBu_ += bx * Bu[dz][dy][qx];
545 GBu_ += gx * Bu[dz][dy][qx];
546 BGu_ += bx * Gu[dz][dy][qx];
548 BBu[dz][qy][qx] = BBu_;
549 GBu[dz][qy][qx] = GBu_;
550 BGu[dz][qy][qx] = BGu_;
555 real_t (*GBBu)[max_Q1D][max_Q1D] = (
real_t (*)[max_Q1D][max_Q1D])sm0;
556 real_t (*BGBu)[max_Q1D][max_Q1D] = (
real_t (*)[max_Q1D][max_Q1D])sm1;
557 real_t (*BBGu)[max_Q1D][max_Q1D] = (
real_t (*)[max_Q1D][max_Q1D])sm2;
558 MFEM_FOREACH_THREAD(qx,x,Q1D)
560 MFEM_FOREACH_THREAD(qy,y,Q1D)
562 MFEM_FOREACH_THREAD(qz,z,Q1D)
567 for (
int dz = 0; dz < D1D; ++dz)
569 const real_t bx = B(qz,dz);
570 const real_t gx = G(qz,dz);
571 GBBu_ += gx * BBu[dz][qy][qx];
572 BGBu_ += bx * GBu[dz][qy][qx];
573 BBGu_ += bx * BGu[dz][qy][qx];
575 GBBu[qz][qy][qx] = GBBu_;
576 BGBu[qz][qy][qx] = BGBu_;
577 BBGu[qz][qy][qx] = BBGu_;
582 real_t (*DGu)[max_Q1D][max_Q1D] = (
real_t (*)[max_Q1D][max_Q1D])sm3;
583 MFEM_FOREACH_THREAD(qz,z,Q1D)
585 MFEM_FOREACH_THREAD(qy,y,Q1D)
587 MFEM_FOREACH_THREAD(qx,x,Q1D)
589 const real_t O1 = op(qx,qy,qz,0,e);
590 const real_t O2 = op(qx,qy,qz,1,e);
591 const real_t O3 = op(qx,qy,qz,2,e);
593 const real_t gradX = BBGu[qz][qy][qx];
594 const real_t gradY = BGBu[qz][qy][qx];
595 const real_t gradZ = GBBu[qz][qy][qx];
597 DGu[qz][qy][qx] = (O1 * gradX) + (O2 * gradY) + (O3 * gradZ);
602 real_t (*BDGu)[max_Q1D][max_Q1D] = (
real_t (*)[max_Q1D][max_Q1D])sm4;
603 MFEM_FOREACH_THREAD(qx,x,Q1D)
605 MFEM_FOREACH_THREAD(qy,y,Q1D)
607 MFEM_FOREACH_THREAD(dz,z,D1D)
610 for (
int qz = 0; qz < Q1D; ++qz)
612 const real_t w = Bt(dz,qz);
613 BDGu_ += w * DGu[qz][qy][qx];
615 BDGu[dz][qy][qx] = BDGu_;
620 real_t (*BBDGu)[max_D1D][max_Q1D] = (
real_t (*)[max_D1D][max_Q1D])sm5;
621 MFEM_FOREACH_THREAD(dz,z,D1D)
623 MFEM_FOREACH_THREAD(qx,x,Q1D)
625 MFEM_FOREACH_THREAD(dy,y,D1D)
628 for (
int qy = 0; qy < Q1D; ++qy)
630 const real_t w = Bt(dy,qy);
631 BBDGu_ += w * BDGu[dz][qy][qx];
633 BBDGu[dz][dy][qx] = BBDGu_;
638 MFEM_FOREACH_THREAD(dz,z,D1D)
640 MFEM_FOREACH_THREAD(dy,y,D1D)
642 MFEM_FOREACH_THREAD(dx,x,D1D)
645 for (
int qx = 0; qx < Q1D; ++qx)
647 const real_t w = Bt(dx,qx);
648 BBBDGu += w * BBDGu[dz][dy][qx];
650 y(dx,dy,dz,e) += BBBDGu;
658template<
int T_D1D = 0,
int T_Q1D = 0>
static
659void PAConvectionApplyT2D(
const int ne,
660 const Array<real_t> &
b,
661 const Array<real_t> &g,
662 const Array<real_t> &bt,
663 const Array<real_t> >,
671 const int D1D = T_D1D ? T_D1D : d1d;
672 const int Q1D = T_Q1D ? T_Q1D : q1d;
675 auto B =
Reshape(
b.Read(), Q1D, D1D);
676 auto Bt =
Reshape(bt.Read(), D1D, Q1D);
677 auto Gt =
Reshape(gt.Read(), D1D, Q1D);
678 auto op =
Reshape(op_.Read(), Q1D, Q1D, 2, NE);
679 auto x =
Reshape(x_.Read(), D1D, D1D, NE);
680 auto y =
Reshape(y_.ReadWrite(), D1D, D1D, NE);
683 const int D1D = T_D1D ? T_D1D : d1d;
684 const int Q1D = T_Q1D ? T_Q1D : q1d;
686 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
687 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
690 for (
int dy = 0; dy < D1D; ++dy)
692 for (
int dx = 0; dx < D1D; ++dx)
694 u[dy][dx] = x(dx,dy,e);
697 real_t Bu[max_D1D][max_Q1D];
698 for (
int dy = 0; dy < D1D; ++dy)
700 for (
int qx = 0; qx < Q1D; ++qx)
703 for (
int dx = 0; dx < D1D; ++dx)
705 const real_t bx = B(qx,dx);
707 Bu[dy][qx] += bx * x;
711 real_t BBu[max_Q1D][max_Q1D];
712 for (
int qx = 0; qx < Q1D; ++qx)
714 for (
int qy = 0; qy < Q1D; ++qy)
717 for (
int dy = 0; dy < D1D; ++dy)
719 const real_t bx = B(qy,dy);
720 BBu[qy][qx] += bx * Bu[dy][qx];
725 real_t DBu[max_Q1D][max_Q1D][2];
726 for (
int qy = 0; qy < Q1D; ++qy)
728 for (
int qx = 0; qx < Q1D; ++qx)
730 const real_t O1 = op(qx,qy,0,e);
731 const real_t O2 = op(qx,qy,1,e);
733 const real_t X = BBu[qy][qx];
735 DBu[qy][qx][0] = O1 * X;
736 DBu[qy][qx][1] = O2 * X;
739 real_t GDBu[max_D1D][max_Q1D][2];
740 for (
int qx = 0; qx < Q1D; ++qx)
742 for (
int dy = 0; dy < D1D; ++dy)
744 GDBu[dy][qx][0] = 0.0;
745 GDBu[dy][qx][1] = 0.0;
746 for (
int qy = 0; qy < Q1D; ++qy)
748 const real_t by = Bt(dy,qy);
749 const real_t gy = Gt(dy,qy);
750 GDBu[dy][qx][0] += by * DBu[qy][qx][0];
751 GDBu[dy][qx][1] += gy * DBu[qy][qx][1];
755 for (
int dx = 0; dx < D1D; ++dx)
757 for (
int dy = 0; dy < D1D; ++dy)
760 for (
int qx = 0; qx < Q1D; ++qx)
762 const real_t bx = Bt(dx,qx);
763 const real_t gx = Gt(dx,qx);
764 res += gx * GDBu[dy][qx][0] + bx * GDBu[dy][qx][1];
773template<
int T_D1D = 0,
int T_Q1D = 0,
int T_NBZ = 0>
static
774void SmemPAConvectionApplyT2D(
const int ne,
775 const Array<real_t> &
b,
776 const Array<real_t> &g,
777 const Array<real_t> &bt,
778 const Array<real_t> >,
786 const int D1D = T_D1D ? T_D1D : d1d;
787 const int Q1D = T_Q1D ? T_Q1D : q1d;
788 constexpr int NBZ = T_NBZ ? T_NBZ : 1;
791 auto B =
Reshape(
b.Read(), Q1D, D1D);
792 auto Bt =
Reshape(bt.Read(), D1D, Q1D);
793 auto Gt =
Reshape(gt.Read(), D1D, Q1D);
794 auto op =
Reshape(op_.Read(), Q1D, Q1D, 2, NE);
795 auto x =
Reshape(x_.Read(), D1D, D1D, NE);
796 auto y =
Reshape(y_.ReadWrite(), D1D, D1D, NE);
799 const int tidz = MFEM_THREAD_ID(z);
800 const int D1D = T_D1D ? T_D1D : d1d;
801 const int Q1D = T_Q1D ? T_Q1D : q1d;
803 constexpr int NBZ = T_NBZ ? T_NBZ : 1;
804 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
805 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
806 MFEM_SHARED
real_t u[NBZ][max_D1D][max_D1D];
807 MFEM_FOREACH_THREAD(dy,y,D1D)
809 MFEM_FOREACH_THREAD(dx,x,D1D)
812 u[tidz][dy][dx] = x(dx,dy,e);
816 MFEM_SHARED
real_t Bu[NBZ][max_D1D][max_Q1D];
817 MFEM_FOREACH_THREAD(dy,y,D1D)
819 MFEM_FOREACH_THREAD(qx,x,Q1D)
821 Bu[tidz][dy][qx] = 0.0;
822 for (
int dx = 0; dx < D1D; ++dx)
824 const real_t bx = B(qx,dx);
825 const real_t x =
u[tidz][dy][dx];
826 Bu[tidz][dy][qx] += bx * x;
831 MFEM_SHARED
real_t BBu[NBZ][max_Q1D][max_Q1D];
832 MFEM_FOREACH_THREAD(qx,x,Q1D)
834 MFEM_FOREACH_THREAD(qy,y,Q1D)
836 BBu[tidz][qy][qx] = 0.0;
837 for (
int dy = 0; dy < D1D; ++dy)
839 const real_t bx = B(qy,dy);
840 BBu[tidz][qy][qx] += bx * Bu[tidz][dy][qx];
846 MFEM_SHARED
real_t DBu[NBZ][max_Q1D][max_Q1D][2];
847 MFEM_FOREACH_THREAD(qy,y,Q1D)
849 MFEM_FOREACH_THREAD(qx,x,Q1D)
851 const real_t O1 = op(qx,qy,0,e);
852 const real_t O2 = op(qx,qy,1,e);
854 const real_t X = BBu[tidz][qy][qx];
856 DBu[tidz][qy][qx][0] = O1 * X;
857 DBu[tidz][qy][qx][1] = O2 * X;
861 MFEM_SHARED
real_t GDBu[NBZ][max_D1D][max_Q1D][2];
862 MFEM_FOREACH_THREAD(qx,x,Q1D)
864 MFEM_FOREACH_THREAD(dy,y,D1D)
866 GDBu[tidz][dy][qx][0] = 0.0;
867 GDBu[tidz][dy][qx][1] = 0.0;
868 for (
int qy = 0; qy < Q1D; ++qy)
870 const real_t by = Bt(dy,qy);
871 const real_t gy = Gt(dy,qy);
872 GDBu[tidz][dy][qx][0] += by * DBu[tidz][qy][qx][0];
873 GDBu[tidz][dy][qx][1] += gy * DBu[tidz][qy][qx][1];
878 MFEM_FOREACH_THREAD(dx,x,D1D)
880 MFEM_FOREACH_THREAD(dy,y,D1D)
883 for (
int qx = 0; qx < Q1D; ++qx)
885 const real_t bx = Bt(dx,qx);
886 const real_t gx = Gt(dx,qx);
887 res += gx * GDBu[tidz][dy][qx][0] + bx * GDBu[tidz][dy][qx][1];
896template<
int T_D1D = 0,
int T_Q1D = 0>
static
897void PAConvectionApplyT3D(
const int ne,
898 const Array<real_t> &
b,
899 const Array<real_t> &g,
900 const Array<real_t> &bt,
901 const Array<real_t> >,
909 const int D1D = T_D1D ? T_D1D : d1d;
910 const int Q1D = T_Q1D ? T_Q1D : q1d;
913 auto B =
Reshape(
b.Read(), Q1D, D1D);
914 auto Bt =
Reshape(bt.Read(), D1D, Q1D);
915 auto Gt =
Reshape(gt.Read(), D1D, Q1D);
916 auto op =
Reshape(op_.Read(), Q1D, Q1D, Q1D, 3, NE);
917 auto x =
Reshape(x_.Read(), D1D, D1D, D1D, NE);
918 auto y =
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
921 const int D1D = T_D1D ? T_D1D : d1d;
922 const int Q1D = T_Q1D ? T_Q1D : q1d;
924 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
925 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
927 real_t u[max_D1D][max_D1D][max_D1D];
928 for (
int dz = 0; dz < D1D; ++dz)
930 for (
int dy = 0; dy < D1D; ++dy)
932 for (
int dx = 0; dx < D1D; ++dx)
934 u[dz][dy][dx] = x(dx,dy,dz,e);
938 real_t Bu[max_D1D][max_D1D][max_Q1D];
939 for (
int dz = 0; dz < D1D; ++dz)
941 for (
int dy = 0; dy < D1D; ++dy)
943 for (
int qx = 0; qx < Q1D; ++qx)
945 Bu[dz][dy][qx] = 0.0;
946 for (
int dx = 0; dx < D1D; ++dx)
948 const real_t bx = B(qx,dx);
949 const real_t x =
u[dz][dy][dx];
950 Bu[dz][dy][qx] += bx * x;
955 real_t BBu[max_D1D][max_Q1D][max_Q1D];
956 for (
int dz = 0; dz < D1D; ++dz)
958 for (
int qx = 0; qx < Q1D; ++qx)
960 for (
int qy = 0; qy < Q1D; ++qy)
962 BBu[dz][qy][qx] = 0.0;
963 for (
int dy = 0; dy < D1D; ++dy)
965 const real_t bx = B(qy,dy);
966 BBu[dz][qy][qx] += bx * Bu[dz][dy][qx];
971 real_t BBBu[max_Q1D][max_Q1D][max_Q1D];
972 for (
int qx = 0; qx < Q1D; ++qx)
974 for (
int qy = 0; qy < Q1D; ++qy)
976 for (
int qz = 0; qz < Q1D; ++qz)
978 BBBu[qz][qy][qx] = 0.0;
979 for (
int dz = 0; dz < D1D; ++dz)
981 const real_t bx = B(qz,dz);
982 BBBu[qz][qy][qx] += bx * BBu[dz][qy][qx];
988 real_t DBu[max_Q1D][max_Q1D][max_Q1D][3];
989 for (
int qz = 0; qz < Q1D; ++qz)
991 for (
int qy = 0; qy < Q1D; ++qy)
993 for (
int qx = 0; qx < Q1D; ++qx)
995 const real_t O1 = op(qx,qy,qz,0,e);
996 const real_t O2 = op(qx,qy,qz,1,e);
997 const real_t O3 = op(qx,qy,qz,2,e);
999 const real_t X = BBBu[qz][qy][qx];
1001 DBu[qz][qy][qx][0] = O1 * X;
1002 DBu[qz][qy][qx][1] = O2 * X;
1003 DBu[qz][qy][qx][2] = O3 * X;
1007 real_t GDBu[max_D1D][max_Q1D][max_Q1D][3];
1008 for (
int qx = 0; qx < Q1D; ++qx)
1010 for (
int qy = 0; qy < Q1D; ++qy)
1012 for (
int dz = 0; dz < D1D; ++dz)
1014 GDBu[dz][qy][qx][0] = 0.0;
1015 GDBu[dz][qy][qx][1] = 0.0;
1016 GDBu[dz][qy][qx][2] = 0.0;
1017 for (
int qz = 0; qz < Q1D; ++qz)
1019 const real_t bz = Bt(dz,qz);
1020 const real_t gz = Gt(dz,qz);
1021 GDBu[dz][qy][qx][0] += bz * DBu[qz][qy][qx][0];
1022 GDBu[dz][qy][qx][1] += bz * DBu[qz][qy][qx][1];
1023 GDBu[dz][qy][qx][2] += gz * DBu[qz][qy][qx][2];
1028 real_t GGDBu[max_D1D][max_D1D][max_Q1D][3];
1029 for (
int dz = 0; dz < D1D; ++dz)
1031 for (
int qx = 0; qx < Q1D; ++qx)
1033 for (
int dy = 0; dy < D1D; ++dy)
1035 GGDBu[dz][dy][qx][0] = 0.0;
1036 GGDBu[dz][dy][qx][1] = 0.0;
1037 GGDBu[dz][dy][qx][2] = 0.0;
1038 for (
int qy = 0; qy < Q1D; ++qy)
1040 const real_t by = Bt(dy,qy);
1041 const real_t gy = Gt(dy,qy);
1042 GGDBu[dz][dy][qx][0] += by * GDBu[dz][qy][qx][0];
1043 GGDBu[dz][dy][qx][1] += gy * GDBu[dz][qy][qx][1];
1044 GGDBu[dz][dy][qx][2] += by * GDBu[dz][qy][qx][2];
1049 for (
int dz = 0; dz < D1D; ++dz)
1051 for (
int dy = 0; dy < D1D; ++dy)
1053 for (
int dx = 0; dx < D1D; ++dx)
1056 for (
int qx = 0; qx < Q1D; ++qx)
1058 const real_t bx = Bt(dx,qx);
1059 const real_t gx = Gt(dx,qx);
1060 res += gx * GGDBu[dz][dy][qx][0];
1061 res += bx * GGDBu[dz][dy][qx][1];
1062 res += bx * GGDBu[dz][dy][qx][2];
1064 y(dx,dy,dz,e) += res;
1072template<
int T_D1D = 0,
int T_Q1D = 0>
static
1073void SmemPAConvectionApplyT3D(
const int ne,
1074 const Array<real_t> &
b,
1075 const Array<real_t> &g,
1076 const Array<real_t> &bt,
1077 const Array<real_t> >,
1085 const int D1D = T_D1D ? T_D1D : d1d;
1086 const int Q1D = T_Q1D ? T_Q1D : q1d;
1089 auto B =
Reshape(
b.Read(), Q1D, D1D);
1090 auto Bt =
Reshape(bt.Read(), D1D, Q1D);
1091 auto Gt =
Reshape(gt.Read(), D1D, Q1D);
1092 auto op =
Reshape(op_.Read(), Q1D, Q1D, Q1D, 3, NE);
1093 auto x =
Reshape(x_.Read(), D1D, D1D, D1D, NE);
1094 auto y =
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
1097 const int D1D = T_D1D ? T_D1D : d1d;
1098 const int Q1D = T_Q1D ? T_Q1D : q1d;
1100 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
1101 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
1102 constexpr int max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D;
1103 MFEM_SHARED
real_t sm0[3*max_DQ*max_DQ*max_DQ];
1104 MFEM_SHARED
real_t sm1[3*max_DQ*max_DQ*max_DQ];
1106 real_t (*
u)[max_D1D][max_D1D] = (
real_t (*)[max_D1D][max_D1D]) sm0;
1107 MFEM_FOREACH_THREAD(dz,z,D1D)
1109 MFEM_FOREACH_THREAD(dy,y,D1D)
1111 MFEM_FOREACH_THREAD(dx,x,D1D)
1113 u[dz][dy][dx] = x(dx,dy,dz,e);
1118 real_t (*Bu)[max_D1D][max_Q1D] = (
real_t (*)[max_D1D][max_Q1D])sm1;
1119 MFEM_FOREACH_THREAD(dz,z,D1D)
1121 MFEM_FOREACH_THREAD(dy,y,D1D)
1123 MFEM_FOREACH_THREAD(qx,x,Q1D)
1126 for (
int dx = 0; dx < D1D; ++dx)
1128 const real_t bx = B(qx,dx);
1129 const real_t x =
u[dz][dy][dx];
1132 Bu[dz][dy][qx] = Bu_;
1137 real_t (*BBu)[max_Q1D][max_Q1D] = (
real_t (*)[max_Q1D][max_Q1D])sm0;
1138 MFEM_FOREACH_THREAD(dz,z,D1D)
1140 MFEM_FOREACH_THREAD(qx,x,Q1D)
1142 MFEM_FOREACH_THREAD(qy,y,Q1D)
1145 for (
int dy = 0; dy < D1D; ++dy)
1147 const real_t bx = B(qy,dy);
1148 BBu_ += bx * Bu[dz][dy][qx];
1150 BBu[dz][qy][qx] = BBu_;
1155 real_t (*BBBu)[max_Q1D][max_Q1D] = (
real_t (*)[max_Q1D][max_Q1D])sm1;
1156 MFEM_FOREACH_THREAD(qx,x,Q1D)
1158 MFEM_FOREACH_THREAD(qy,y,Q1D)
1160 MFEM_FOREACH_THREAD(qz,z,Q1D)
1163 for (
int dz = 0; dz < D1D; ++dz)
1165 const real_t bx = B(qz,dz);
1166 BBBu_ += bx * BBu[dz][qy][qx];
1168 BBBu[qz][qy][qx] = BBBu_;
1173 real_t (*DBu)[max_Q1D][max_Q1D][3] = (
real_t (*)[max_Q1D][max_Q1D][3])sm0;
1174 MFEM_FOREACH_THREAD(qz,z,Q1D)
1176 MFEM_FOREACH_THREAD(qy,y,Q1D)
1178 MFEM_FOREACH_THREAD(qx,x,Q1D)
1180 const real_t O1 = op(qx,qy,qz,0,e);
1181 const real_t O2 = op(qx,qy,qz,1,e);
1182 const real_t O3 = op(qx,qy,qz,2,e);
1184 const real_t X = BBBu[qz][qy][qx];
1186 DBu[qz][qy][qx][0] = O1 * X;
1187 DBu[qz][qy][qx][1] = O2 * X;
1188 DBu[qz][qy][qx][2] = O3 * X;
1193 real_t (*GDBu)[max_Q1D][max_Q1D][3] = (
real_t (*)[max_Q1D][max_Q1D][3])sm1;
1194 MFEM_FOREACH_THREAD(qx,x,Q1D)
1196 MFEM_FOREACH_THREAD(qy,y,Q1D)
1198 MFEM_FOREACH_THREAD(dz,z,D1D)
1203 for (
int qz = 0; qz < Q1D; ++qz)
1205 const real_t bz = Bt(dz,qz);
1206 const real_t gz = Gt(dz,qz);
1207 GDBu0 += bz * DBu[qz][qy][qx][0];
1208 GDBu1 += bz * DBu[qz][qy][qx][1];
1209 GDBu2 += gz * DBu[qz][qy][qx][2];
1211 GDBu[dz][qy][qx][0] = GDBu0;
1212 GDBu[dz][qy][qx][1] = GDBu1;
1213 GDBu[dz][qy][qx][2] = GDBu2;
1218 real_t (*GGDBu)[max_D1D][max_Q1D][3] = (
real_t (*)[max_D1D][max_Q1D][3])sm0;
1219 MFEM_FOREACH_THREAD(dz,z,D1D)
1221 MFEM_FOREACH_THREAD(qx,x,Q1D)
1223 MFEM_FOREACH_THREAD(dy,y,D1D)
1228 for (
int qy = 0; qy < Q1D; ++qy)
1230 const real_t by = Bt(dy,qy);
1231 const real_t gy = Gt(dy,qy);
1232 GGDBu0 += by * GDBu[dz][qy][qx][0];
1233 GGDBu1 += gy * GDBu[dz][qy][qx][1];
1234 GGDBu2 += by * GDBu[dz][qy][qx][2];
1236 GGDBu[dz][dy][qx][0] = GGDBu0;
1237 GGDBu[dz][dy][qx][1] = GGDBu1;
1238 GGDBu[dz][dy][qx][2] = GGDBu2;
1243 MFEM_FOREACH_THREAD(dz,z,D1D)
1245 MFEM_FOREACH_THREAD(dy,y,D1D)
1247 MFEM_FOREACH_THREAD(dx,x,D1D)
1250 for (
int qx = 0; qx < Q1D; ++qx)
1252 const real_t bx = Bt(dx,qx);
1253 const real_t gx = Gt(dx,qx);
1254 res += gx * GGDBu[dz][dy][qx][0];
1255 res += bx * GGDBu[dz][dy][qx][1];
1256 res += bx * GGDBu[dz][dy][qx][2];
1258 y(dx,dy,dz,e) += res;
1267constexpr int ipow(
int x,
int p) {
return p == 0 ? 1 : x*ipow(x,
p-1); }
1268constexpr int D(
int D1D) {
return (11 - D1D) / 2; }
1269constexpr int NBZ(
int D1D)
1271 return ipow(2, D(D1D) >= 0 ? D(D1D) : 0);
1275template <
int DIM,
int T_D1D,
int T_Q1D>
1277ConvectionIntegrator::ApplyPAKernels::Kernel()
1279 if constexpr (
DIM == 2)
1281 constexpr int T_NBZ = convection::NBZ(T_D1D);
1282 return SmemPAConvectionApply2D<T_D1D, T_Q1D, T_NBZ>;
1284 else if constexpr (
DIM == 3)
1286 return SmemPAConvectionApply3D<T_D1D, T_Q1D>;
1291template <
int DIM,
int T_D1D,
int T_Q1D>
1293ConvectionIntegrator::ApplyPATKernels::Kernel()
1295 if constexpr (
DIM == 2)
1297 constexpr int T_NBZ = convection::NBZ(T_D1D);
1298 return SmemPAConvectionApplyT2D<T_D1D, T_Q1D, T_NBZ>;
1300 else if constexpr (
DIM == 3)
1302 return SmemPAConvectionApplyT3D<T_D1D, T_Q1D>;
void(*)(const int, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Vector &, const Vector &, Vector &, const int, const int) ApplyKernelType
arguments: NE, B, G, Bt, Gt, pa_data, x, y, D1D, Q1D
real_t u(const Vector &xvec)
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
void forall_3D(int N, int X, int Y, int Z, lambda &&body)
void forall(int N, lambda &&body)
real_t p(const Vector &x, real_t t)
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.